refactor(media): move provider defaults into media metadata

This commit is contained in:
Peter Steinberger 2026-04-04 07:00:35 +01:00
parent fca80d2ee2
commit 3a3f88a80a
No known key found for this signature in database
26 changed files with 308 additions and 148 deletions

View File

@ -7,6 +7,9 @@ import {
// Anthropic media-understanding provider: image description only.
export const anthropicMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "anthropic",
  capabilities: ["image"],
  // Model used for a capability when none is configured.
  defaultModels: { image: "claude-opus-4-6" },
  // Auto-selection rank per capability; lower values are tried first.
  autoPriority: { image: 20 },
  // Accepts raw PDF bytes natively (no text/image pre-extraction needed).
  nativeDocumentInputs: ["pdf"],
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
};

View File

@ -4,5 +4,7 @@ import { transcribeDeepgramAudio } from "./audio.js";
// Deepgram media-understanding provider: audio transcription only.
export const deepgramMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "deepgram",
  capabilities: ["audio"],
  // Transcription model used when none is configured.
  defaultModels: { audio: "nova-3" },
  // Auto-selection rank; lower values are tried first.
  autoPriority: { audio: 30 },
  transcribeAudio: transcribeDeepgramAudio,
};

View File

@ -155,6 +155,13 @@ export async function describeGeminiVideo(
// Google media-understanding provider: image, audio, and video support.
export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "google",
  capabilities: ["image", "audio", "video"],
  defaultModels: {
    // NOTE(review): the image default reuses DEFAULT_GOOGLE_VIDEO_MODEL.
    // This looks like a copy/paste slip — the previous bundled image default
    // for google was "gemini-3-flash-preview", not the video model. Confirm
    // whether a dedicated image-model constant was intended here.
    image: DEFAULT_GOOGLE_VIDEO_MODEL,
    audio: DEFAULT_GOOGLE_AUDIO_MODEL,
    video: DEFAULT_GOOGLE_VIDEO_MODEL,
  },
  // Auto-selection rank per capability; lower values are tried first.
  autoPriority: { image: 30, audio: 40, video: 10 },
  // Accepts raw PDF bytes natively (no text/image pre-extraction needed).
  nativeDocumentInputs: ["pdf"],
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
  transcribeAudio: transcribeGeminiAudio,

View File

@ -9,6 +9,8 @@ const DEFAULT_GROQ_AUDIO_MODEL = "whisper-large-v3-turbo";
// Groq media-understanding provider: audio transcription via the
// OpenAI-compatible endpoint.
export const groqMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "groq",
  capabilities: ["audio"],
  // Transcription model used when none is configured.
  defaultModels: { audio: DEFAULT_GROQ_AUDIO_MODEL },
  // Auto-selection rank; lower values are tried first.
  autoPriority: { audio: 20 },
  transcribeAudio: (req) =>
    transcribeOpenAiCompatibleAudio({
      ...req,

View File

@ -7,6 +7,8 @@ import {
// MiniMax media-understanding provider (direct API): image description only.
export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "minimax",
  capabilities: ["image"],
  // Vision model used when none is configured.
  defaultModels: { image: "MiniMax-VL-01" },
  // Auto-selection rank; lower values are tried first.
  autoPriority: { image: 40 },
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
};
@ -14,6 +16,8 @@ export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = {
// MiniMax portal variant: same vision model, ranked after the direct
// "minimax" provider in auto-selection.
export const minimaxPortalMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "minimax-portal",
  capabilities: ["image"],
  // Vision model used when none is configured.
  defaultModels: { image: "MiniMax-VL-01" },
  // Auto-selection rank; lower values are tried first.
  autoPriority: { image: 50 },
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
};

View File

@ -9,6 +9,8 @@ const DEFAULT_MISTRAL_AUDIO_MODEL = "voxtral-mini-latest";
// Mistral media-understanding provider: audio transcription (Voxtral) via
// the OpenAI-compatible endpoint.
export const mistralMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "mistral",
  capabilities: ["audio"],
  // Transcription model used when none is configured.
  defaultModels: { audio: DEFAULT_MISTRAL_AUDIO_MODEL },
  // Auto-selection rank; lower values are tried first.
  autoPriority: { audio: 50 },
  transcribeAudio: async (req) =>
    await transcribeOpenAiCompatibleAudio({
      ...req,

View File

@ -126,6 +126,8 @@ export async function describeMoonshotVideo(
// Moonshot media-understanding provider: image and video description.
export const moonshotMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "moonshot",
  capabilities: ["image", "video"],
  // Per-capability models used when none is configured.
  defaultModels: { image: "kimi-k2.5", video: DEFAULT_MOONSHOT_VIDEO_MODEL },
  // Only video participates in auto-selection (lower values are tried
  // first); with no image entry, moonshot is skipped for image auto-fallback.
  autoPriority: { video: 20 },
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
  describeVideo: describeMoonshotVideo,

View File

@ -21,6 +21,11 @@ export async function transcribeOpenAiAudio(params: AudioTranscriptionRequest) {
// OpenAI media-understanding provider: image description and audio
// transcription.
export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "openai",
  capabilities: ["image", "audio"],
  // Per-capability models used when none is configured.
  defaultModels: {
    image: "gpt-5.4-mini",
    audio: OPENAI_DEFAULT_AUDIO_TRANSCRIPTION_MODEL,
  },
  // Auto-selection rank per capability; lower values are tried first, so
  // OpenAI is the first candidate for both image and audio.
  autoPriority: { image: 10, audio: 10 },
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
  transcribeAudio: transcribeOpenAiAudio,
@ -29,6 +34,7 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
// OpenAI Codex variant: image description only. Note it declares no
// autoPriority, so it is never picked by auto key-provider selection and is
// only used when explicitly configured.
export const openaiCodexMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "openai-codex",
  capabilities: ["image"],
  // Vision model used when none is configured.
  defaultModels: { image: "gpt-5.4" },
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
};

View File

@ -7,6 +7,7 @@ import {
// OpenRouter media-understanding provider: image description only. Declares
// no autoPriority, so it is excluded from auto key-provider selection and is
// only used when explicitly configured.
export const openrouterMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "openrouter",
  // "auto" delegates model choice to OpenRouter's own routing.
  capabilities: ["image"],
  defaultModels: { image: "auto" },
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
};

View File

@ -11,9 +11,9 @@ vi.mock("openclaw/plugin-sdk/agent-runtime", () => ({
}));
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
AUTO_IMAGE_KEY_PROVIDERS: ["openai"],
DEFAULT_IMAGE_MODELS: { openai: "gpt-4.1-mini" },
resolveAutoImageModel: vi.fn(async () => null),
resolveAutoMediaKeyProviders: vi.fn(() => ["openai"]),
resolveDefaultMediaModel: vi.fn(() => "gpt-4.1-mini"),
}));
vi.mock("openclaw/plugin-sdk/media-understanding-runtime", () => ({

View File

@ -9,8 +9,11 @@ import {
import { resolveDefaultModelForAgent } from "openclaw/plugin-sdk/agent-runtime";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
import { loadJsonFile, saveJsonFile } from "openclaw/plugin-sdk/json-store";
import { AUTO_IMAGE_KEY_PROVIDERS, DEFAULT_IMAGE_MODELS } from "openclaw/plugin-sdk/media-runtime";
import { resolveAutoImageModel } from "openclaw/plugin-sdk/media-runtime";
import {
resolveAutoMediaKeyProviders,
resolveDefaultMediaModel,
} from "openclaw/plugin-sdk/media-runtime";
import { describeImageFileWithModel } from "openclaw/plugin-sdk/media-understanding-runtime";
import { logVerbose } from "openclaw/plugin-sdk/runtime-env";
import { STATE_DIR } from "openclaw/plugin-sdk/state-paths";
@ -182,6 +185,11 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
}
};
const autoProviders = resolveAutoMediaKeyProviders({
cfg,
capability: "image",
});
const selectCatalogModel = (provider: string) => {
const entries = catalog.filter(
(entry) =>
@ -190,7 +198,11 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
if (entries.length === 0) {
return undefined;
}
const defaultId = DEFAULT_IMAGE_MODELS[provider];
const defaultId = resolveDefaultMediaModel({
cfg,
providerId: provider,
capability: "image",
});
const preferred = entries.find((entry) => entry.id === defaultId);
return preferred ?? entries[0];
};
@ -198,16 +210,14 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
let resolved = null as { provider: string; model?: string } | null;
if (
activeModel &&
AUTO_IMAGE_KEY_PROVIDERS.includes(
activeModel.provider as (typeof AUTO_IMAGE_KEY_PROVIDERS)[number],
) &&
autoProviders.includes(activeModel.provider) &&
(await hasProviderKey(activeModel.provider))
) {
resolved = activeModel;
}
if (!resolved) {
for (const provider of AUTO_IMAGE_KEY_PROVIDERS) {
for (const provider of autoProviders) {
if (!(await hasProviderKey(provider))) {
continue;
}

View File

@ -7,6 +7,8 @@ import {
// Z.AI media-understanding provider: image description only.
export const zaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
  id: "zai",
  capabilities: ["image"],
  // Vision model used when none is configured.
  defaultModels: { image: "glm-4.6v" },
  // Auto-selection rank; lower values are tried first (zai is last among
  // the bundled image providers).
  autoPriority: { image: 60 },
  describeImage: describeImageWithModel,
  describeImages: describeImagesWithModel,
};

View File

@ -126,11 +126,12 @@ function resolveImageGenerationModelCandidates(
providerDefaults.set(providerId, `${providerId}/${modelId}`);
}
const primaryProvider = resolveDefaultModelRef(cfg).provider;
const orderedProviders = [
resolveDefaultModelRef(cfg).provider,
"openai",
"google",
...providerDefaults.keys(),
primaryProvider,
...[...providerDefaults.keys()]
.filter((providerId) => providerId !== primaryProvider)
.toSorted(),
];
const orderedRefs: string[] = [];
const seen = new Set<string>();

View File

@ -274,7 +274,7 @@ function createMinimaxImageConfig(): OpenClawConfig {
function createDefaultImageFallbackExpectation(primary: string) {
return {
primary,
fallbacks: ["openai/gpt-5-mini", "anthropic/claude-opus-4-5"],
fallbacks: ["openai/gpt-5.4-mini", "anthropic/claude-opus-4-6"],
};
}
@ -618,12 +618,12 @@ describe("image tool implicit imageModel config", () => {
agents: {
defaults: {
model: { primary: "minimax/MiniMax-M2.7" },
imageModel: { primary: "openai/gpt-5-mini" },
imageModel: { primary: "openai/gpt-5.4-mini" },
},
},
};
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "openai/gpt-5-mini",
primary: "openai/gpt-5.4-mini",
});
});
});
@ -638,7 +638,7 @@ describe("image tool implicit imageModel config", () => {
agents: {
defaults: {
model: { primary: "acme/vision-1" },
imageModel: { primary: "openai/gpt-5-mini" },
imageModel: { primary: "openai/gpt-5.4-mini" },
},
},
models: {
@ -652,7 +652,7 @@ describe("image tool implicit imageModel config", () => {
};
// Tool should still be available for explicit image analysis requests
expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "openai/gpt-5-mini",
primary: "openai/gpt-5.4-mini",
});
const tool = createImageTool({ config: cfg, agentDir, modelHasVision: true });
expect(tool).not.toBeNull();
@ -1229,7 +1229,7 @@ describe("image tool response validation", () => {
role: "assistant",
api: "openai-responses",
provider: "openai",
model: "gpt-5-mini",
model: "gpt-5.4-mini",
stopReason: "stop",
timestamp: Date.now(),
usage: makeZeroUsageSnapshot(),
@ -1278,7 +1278,7 @@ describe("image tool response validation", () => {
expect(() =>
__testing.coerceImageAssistantText({
provider: "openai",
model: "gpt-5-mini",
model: "gpt-5.4-mini",
message,
}),
).toThrow(expectedError);

View File

@ -1,6 +1,10 @@
import { resolve, isAbsolute } from "node:path";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import {
resolveAutoMediaKeyProviders,
resolveDefaultMediaModel,
} from "../../media-understanding/defaults.js";
import { getMediaUnderstandingProvider } from "../../media-understanding/provider-registry.js";
import { buildProviderRegistry } from "../../media-understanding/runner.js";
import { loadWebMedia } from "../../media/web-media.js";
@ -40,8 +44,6 @@ import {
} from "./tool-runtime.helpers.js";
const DEFAULT_PROMPT = "Describe the image.";
const ANTHROPIC_IMAGE_PRIMARY = "anthropic/claude-opus-4-6";
const ANTHROPIC_IMAGE_FALLBACK = "anthropic/claude-opus-4-5";
const DEFAULT_MAX_IMAGES = 20;
const imageToolProviderDeps = {
@ -103,28 +105,39 @@ export function resolveImageModelConfigForTool(params: {
provider: primary.provider,
});
const primaryCandidates = (() => {
if (isMinimaxVlmProvider(primary.provider)) {
return [`${primary.provider}/MiniMax-VL-01`];
}
if (providerVisionFromConfig) {
return [providerVisionFromConfig];
}
if (primary.provider === "zai") {
return ["zai/glm-4.6v"];
const providerDefault = resolveDefaultMediaModel({
cfg: params.cfg,
providerId: primary.provider,
capability: "image",
});
if (providerDefault) {
return [`${primary.provider}/${providerDefault}`];
}
if (primary.provider === "openai") {
return ["openai/gpt-5-mini"];
}
if (primary.provider === "anthropic") {
return [ANTHROPIC_IMAGE_PRIMARY];
if (isMinimaxVlmProvider(primary.provider)) {
return [`${primary.provider}/MiniMax-VL-01`];
}
return [];
})();
const autoCandidates = resolveAutoMediaKeyProviders({
cfg: params.cfg,
capability: "image",
}).map((providerId) => {
const modelId = resolveDefaultMediaModel({
cfg: params.cfg,
providerId,
capability: "image",
});
return modelId ? `${providerId}/${modelId}` : null;
});
return buildToolModelConfigFromCandidates({
explicit,
agentDir: params.agentDir,
candidates: [...primaryCandidates, "openai/gpt-5-mini", ANTHROPIC_IMAGE_FALLBACK],
candidates: [...primaryCandidates, ...autoCandidates],
});
}

View File

@ -4,22 +4,16 @@ import {
resolveAgentModelFallbackValues,
resolveAgentModelPrimaryValue,
} from "../../config/model-input.js";
import { providerSupportsNativePdfDocument } from "../../media-understanding/defaults.js";
import { extractAssistantText } from "../pi-embedded-utils.js";
export type PdfModelConfig = { primary?: string; fallbacks?: string[] };
/**
* Providers known to support native PDF document input.
* When the model's provider is in this set, the tool sends raw PDF bytes
* via provider-specific API calls instead of extracting text/images first.
*/
export const NATIVE_PDF_PROVIDERS = new Set(["anthropic", "google"]);
/**
* Check whether a provider supports native PDF document input.
*/
export function providerSupportsNativePdf(provider: string): boolean {
return NATIVE_PDF_PROVIDERS.has(provider.toLowerCase().trim());
return providerSupportsNativePdfDocument({ providerId: provider });
}
/**

View File

@ -46,7 +46,7 @@ async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promi
}
const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6";
const OPENAI_PDF_MODEL = "openai/gpt-5-mini";
const OPENAI_PDF_MODEL = "openai/gpt-5.4-mini";
const TEST_PDF_INPUT = { base64: "dGVzdA==", filename: "doc.pdf" } as const;
const FAKE_PDF_MEDIA = {
kind: "document",
@ -295,12 +295,12 @@ describe("resolvePdfModelConfigForTool", () => {
agents: {
defaults: {
model: { primary: "openai/gpt-5.4" },
imageModel: { primary: "openai/gpt-5-mini" },
imageModel: { primary: "openai/gpt-5.4-mini" },
},
},
};
expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({
primary: "openai/gpt-5-mini",
primary: "openai/gpt-5.4-mini",
});
});
});

View File

@ -1,6 +1,11 @@
import { type Context, complete } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import {
providerSupportsNativePdfDocument,
resolveAutoMediaKeyProviders,
resolveDefaultMediaModel,
} from "../../media-understanding/defaults.js";
import { extractPdfContent, type PdfExtractedContent } from "../../media/pdf-extract.js";
import { loadWebMediaRaw } from "../../media/web-media.js";
import { resolveUserPath } from "../../utils.js";
@ -43,8 +48,6 @@ const DEFAULT_PROMPT = "Analyze this PDF document.";
const DEFAULT_MAX_PDFS = 10;
const DEFAULT_MAX_BYTES_MB = 10;
const DEFAULT_MAX_PAGES = 20;
const ANTHROPIC_PDF_PRIMARY = "anthropic/claude-opus-4-6";
const ANTHROPIC_PDF_FALLBACK = "anthropic/claude-opus-4-5";
const PDF_MIN_TEXT_CHARS = 200;
const PDF_MAX_PIXELS = 4_000_000;
@ -75,9 +78,7 @@ export function resolvePdfModelConfigForTool(params: {
// Auto-detect from available providers
const primary = resolveDefaultModelRef(params.cfg);
const anthropicOk = hasAuthForProvider({ provider: "anthropic", agentDir: params.agentDir });
const googleOk = hasAuthForProvider({ provider: "google", agentDir: params.agentDir });
const openaiOk = hasAuthForProvider({ provider: "openai", agentDir: params.agentDir });
const fallbacks: string[] = [];
const addFallback = (ref: string) => {
@ -95,30 +96,54 @@ export function resolvePdfModelConfigForTool(params: {
cfg: params.cfg,
provider: primary.provider,
});
const providerDefault = resolveDefaultMediaModel({
cfg: params.cfg,
providerId: primary.provider,
capability: "image",
});
const nativePdfCandidates = resolveAutoMediaKeyProviders({
cfg: params.cfg,
capability: "image",
})
.filter((providerId) => providerSupportsNativePdfDocument({ cfg: params.cfg, providerId }))
.filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir }))
.map((providerId) => {
const modelId = resolveDefaultMediaModel({
cfg: params.cfg,
providerId,
capability: "image",
});
return modelId ? `${providerId}/${modelId}` : null;
})
.filter((value): value is string => Boolean(value));
const genericImageCandidates = resolveAutoMediaKeyProviders({
cfg: params.cfg,
capability: "image",
})
.filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir }))
.map((providerId) => {
const modelId = resolveDefaultMediaModel({
cfg: params.cfg,
providerId,
capability: "image",
});
return modelId ? `${providerId}/${modelId}` : null;
})
.filter((value): value is string => Boolean(value));
if (primary.provider === "anthropic" && anthropicOk) {
preferred = ANTHROPIC_PDF_PRIMARY;
} else if (primary.provider === "google" && googleOk && providerVision) {
if (primary.provider === "google" && googleOk && providerVision) {
preferred = providerVision;
} else if (providerOk && providerVision) {
preferred = providerVision;
} else if (anthropicOk) {
preferred = ANTHROPIC_PDF_PRIMARY;
} else if (googleOk) {
preferred = "google/gemini-2.5-pro";
} else if (openaiOk) {
preferred = "openai/gpt-5-mini";
} else if (providerOk && (providerVision || providerDefault)) {
preferred = providerVision ?? `${primary.provider}/${providerDefault}`;
} else {
preferred = nativePdfCandidates[0] ?? genericImageCandidates[0] ?? null;
}
if (preferred?.trim()) {
if (anthropicOk && preferred !== ANTHROPIC_PDF_PRIMARY) {
addFallback(ANTHROPIC_PDF_PRIMARY);
}
if (anthropicOk) {
addFallback(ANTHROPIC_PDF_FALLBACK);
}
if (openaiOk) {
addFallback("openai/gpt-5-mini");
for (const candidate of [...nativePdfCandidates, ...genericImageCandidates]) {
if (candidate !== preferred) {
addFallback(candidate);
}
}
const pruned = fallbacks.filter((ref) => ref !== preferred);
return { primary: preferred, ...(pruned.length > 0 ? { fallbacks: pruned } : {}) };

View File

@ -178,7 +178,7 @@ describe("config schema regressions", () => {
defaults: {
pdfModel: {
primary: "anthropic/claude-opus-4-6",
fallbacks: ["openai/gpt-5-mini"],
fallbacks: ["openai/gpt-5.4-mini"],
},
pdfMaxBytesMb: 12,
pdfMaxPages: 25,
@ -193,7 +193,7 @@ describe("config schema regressions", () => {
const res = validateConfigObject({
agents: {
defaults: {
pdfModel: { primary: "openai/gpt-5-mini" },
pdfModel: { primary: "openai/gpt-5.4-mini" },
pdfMaxBytesMb: 0,
pdfMaxPages: 0,
},

View File

@ -25,7 +25,8 @@ const DEFAULT_MODEL_ALIASES: Readonly<Record<string, string>> = {
// OpenAI
gpt: "openai/gpt-5.4",
"gpt-mini": "openai/gpt-5-mini",
"gpt-mini": "openai/gpt-5.4-mini",
"gpt-nano": "openai/gpt-5.4-nano",
// Google Gemini (3.x are preview ids in the catalog)
gemini: "google/gemini-3.1-pro-preview",

View File

@ -1,44 +1,64 @@
import { describe, expect, it } from "vitest";
import {
AUTO_AUDIO_KEY_PROVIDERS,
AUTO_IMAGE_KEY_PROVIDERS,
AUTO_VIDEO_KEY_PROVIDERS,
DEFAULT_AUDIO_MODELS,
DEFAULT_IMAGE_MODELS,
providerSupportsNativePdfDocument,
resolveAutoMediaKeyProviders,
resolveDefaultMediaModel,
} from "./defaults.js";
describe("DEFAULT_AUDIO_MODELS", () => {
it("includes Mistral Voxtral default", () => {
expect(DEFAULT_AUDIO_MODELS.mistral).toBe("voxtral-mini-latest");
describe("resolveDefaultMediaModel", () => {
it("resolves bundled audio defaults from provider metadata", () => {
expect(resolveDefaultMediaModel({ providerId: "mistral", capability: "audio" })).toBe(
"voxtral-mini-latest",
);
});
it("resolves bundled image defaults beyond the historical core set", () => {
expect(resolveDefaultMediaModel({ providerId: "minimax-portal", capability: "image" })).toBe(
"MiniMax-VL-01",
);
expect(resolveDefaultMediaModel({ providerId: "openai-codex", capability: "image" })).toBe(
"gpt-5.4",
);
expect(resolveDefaultMediaModel({ providerId: "moonshot", capability: "image" })).toBe(
"kimi-k2.5",
);
expect(resolveDefaultMediaModel({ providerId: "openrouter", capability: "image" })).toBe(
"auto",
);
});
});
describe("AUTO_AUDIO_KEY_PROVIDERS", () => {
it("includes mistral auto key resolution", () => {
expect(AUTO_AUDIO_KEY_PROVIDERS).toContain("mistral");
describe("resolveAutoMediaKeyProviders", () => {
it("keeps the bundled audio fallback order", () => {
expect(resolveAutoMediaKeyProviders({ capability: "audio" })).toEqual([
"openai",
"groq",
"deepgram",
"google",
"mistral",
]);
});
it("keeps the bundled image fallback order", () => {
expect(resolveAutoMediaKeyProviders({ capability: "image" })).toEqual([
"openai",
"anthropic",
"google",
"minimax",
"minimax-portal",
"zai",
]);
});
it("keeps the bundled video fallback order", () => {
expect(resolveAutoMediaKeyProviders({ capability: "video" })).toEqual(["google", "moonshot"]);
});
});
describe("AUTO_VIDEO_KEY_PROVIDERS", () => {
it("includes moonshot auto key resolution", () => {
expect(AUTO_VIDEO_KEY_PROVIDERS).toContain("moonshot");
});
});
describe("AUTO_IMAGE_KEY_PROVIDERS", () => {
it("includes minimax-portal auto key resolution", () => {
expect(AUTO_IMAGE_KEY_PROVIDERS).toContain("minimax-portal");
});
});
describe("DEFAULT_IMAGE_MODELS", () => {
it("includes the MiniMax portal vision default", () => {
expect(DEFAULT_IMAGE_MODELS["minimax-portal"]).toBe("MiniMax-VL-01");
});
it("includes bundled image-provider defaults beyond the core provider set", () => {
expect(DEFAULT_IMAGE_MODELS["openai-codex"]).toBe("gpt-5.4");
expect(DEFAULT_IMAGE_MODELS.moonshot).toBe("kimi-k2.5");
expect(DEFAULT_IMAGE_MODELS.openrouter).toBe("auto");
describe("providerSupportsNativePdfDocument", () => {
it("reads native PDF support from provider metadata", () => {
expect(providerSupportsNativePdfDocument({ providerId: "anthropic" })).toBe(true);
expect(providerSupportsNativePdfDocument({ providerId: "google" })).toBe(true);
expect(providerSupportsNativePdfDocument({ providerId: "openai" })).toBe(false);
});
});

View File

@ -1,4 +1,6 @@
import type { MediaUnderstandingCapability } from "./types.js";
import type { OpenClawConfig } from "../config/config.js";
import { buildMediaUnderstandingRegistry, normalizeMediaProviderId } from "./provider-registry.js";
import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
const MB = 1024 * 1024;
@ -27,43 +29,79 @@ export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
video: "Describe the video.",
};
export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
groq: "whisper-large-v3-turbo",
openai: "gpt-4o-mini-transcribe",
deepgram: "nova-3",
mistral: "voxtral-mini-latest",
};
export const AUTO_AUDIO_KEY_PROVIDERS = [
"openai",
"groq",
"deepgram",
"google",
"mistral",
] as const;
export const AUTO_IMAGE_KEY_PROVIDERS = [
"openai",
"anthropic",
"google",
"minimax",
"minimax-portal",
"zai",
] as const;
export const AUTO_VIDEO_KEY_PROVIDERS = ["google", "moonshot"] as const;
export const DEFAULT_IMAGE_MODELS: Record<string, string> = {
openai: "gpt-5-mini",
"openai-codex": "gpt-5.4",
anthropic: "claude-opus-4-6",
google: "gemini-3-flash-preview",
minimax: "MiniMax-VL-01",
"minimax-portal": "MiniMax-VL-01",
moonshot: "kimi-k2.5",
openrouter: "auto",
zai: "glm-4.6v",
};
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
export const DEFAULT_MEDIA_CONCURRENCY = 2;
/**
 * Whether a registry entry can actually service a capability.
 *
 * Judged by the presence of the matching handler function (not the declared
 * `capabilities` list), so a provider entry that advertises a capability but
 * lacks its handler is treated as unsupported.
 */
function providerSupportsCapability(
  provider: MediaUnderstandingProvider | undefined,
  capability: MediaUnderstandingCapability,
): boolean {
  if (!provider) {
    return false;
  }
  switch (capability) {
    case "audio":
      return Boolean(provider.transcribeAudio);
    case "image":
      return Boolean(provider.describeImage);
    default:
      // Remaining capability is "video".
      return Boolean(provider.describeVideo);
  }
}
/**
 * Build the bundled provider registry, tolerating a missing config by
 * substituting an empty one.
 */
function resolveDefaultRegistry(cfg?: OpenClawConfig) {
  const effectiveCfg = cfg ?? ({} as OpenClawConfig);
  return buildMediaUnderstandingRegistry(undefined, effectiveCfg);
}
/**
 * Resolve the bundled default model for a provider/capability pair.
 *
 * Looks the provider up in the supplied registry (or a freshly built default
 * registry) under its normalized id and returns the trimmed default model id,
 * or `undefined` when none is declared or the declared value is blank.
 */
export function resolveDefaultMediaModel(params: {
  providerId: string;
  capability: MediaUnderstandingCapability;
  cfg?: OpenClawConfig;
  providerRegistry?: Map<string, MediaUnderstandingProvider>;
}): string | undefined {
  const { providerRegistry, cfg, providerId, capability } = params;
  const registry = providerRegistry ?? resolveDefaultRegistry(cfg);
  const entry = registry.get(normalizeMediaProviderId(providerId));
  const trimmed = entry?.defaultModels?.[capability]?.trim();
  // Blank strings are treated the same as "no default declared".
  return trimmed ? trimmed : undefined;
}
/**
 * Compute the auto key-provider fallback order for a capability.
 *
 * Only providers that both implement the capability's handler and declare a
 * finite numeric `autoPriority` for it participate. Results are ordered by
 * ascending priority (lower values are tried first), with ties broken by
 * provider id, and returned as normalized provider ids.
 */
export function resolveAutoMediaKeyProviders(params: {
  capability: MediaUnderstandingCapability;
  cfg?: OpenClawConfig;
  providerRegistry?: Map<string, MediaUnderstandingProvider>;
}): string[] {
  const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
  const ranked: Array<{ provider: MediaUnderstandingProvider; priority: number }> = [];
  for (const provider of registry.values()) {
    if (!providerSupportsCapability(provider, params.capability)) {
      continue;
    }
    const priority = provider.autoPriority?.[params.capability];
    // Providers without a finite priority opt out of auto-selection.
    if (typeof priority !== "number" || !Number.isFinite(priority)) {
      continue;
    }
    ranked.push({ provider, priority });
  }
  ranked.sort((left, right) =>
    left.priority === right.priority
      ? left.provider.id.localeCompare(right.provider.id)
      : left.priority - right.priority,
  );
  return ranked
    .map((entry) => normalizeMediaProviderId(entry.provider.id))
    .filter(Boolean);
}
/**
 * Whether a provider accepts raw PDF bytes natively, per its
 * `nativeDocumentInputs` metadata. Unknown providers and providers without
 * the metadata report `false`.
 */
export function providerSupportsNativePdfDocument(params: {
  providerId: string;
  cfg?: OpenClawConfig;
  providerRegistry?: Map<string, MediaUnderstandingProvider>;
}): boolean {
  const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
  const entry = registry.get(normalizeMediaProviderId(params.providerId));
  const nativeInputs = entry?.nativeDocumentInputs;
  return nativeInputs ? nativeInputs.includes("pdf") : false;
}
/**
* Minimum audio file size in bytes below which transcription is skipped.
* Files smaller than this threshold are almost certainly empty or corrupt

View File

@ -14,6 +14,9 @@ function mergeProviderIntoRegistry(
...existing,
...provider,
capabilities: provider.capabilities ?? existing.capabilities,
defaultModels: provider.defaultModels ?? existing.defaultModels,
autoPriority: provider.autoPriority ?? existing.autoPriority,
nativeDocumentInputs: provider.nativeDocumentInputs ?? existing.nativeDocumentInputs,
}
: provider;
registry.set(normalizedKey, merged);
@ -41,6 +44,9 @@ export function buildMediaUnderstandingRegistry(
...existing,
...provider,
capabilities: provider.capabilities ?? existing.capabilities,
defaultModels: provider.defaultModels ?? existing.defaultModels,
autoPriority: provider.autoPriority ?? existing.autoPriority,
nativeDocumentInputs: provider.nativeDocumentInputs ?? existing.nativeDocumentInputs,
}
: provider;
registry.set(normalizedKey, merged);

View File

@ -24,9 +24,9 @@ import { runExec } from "../process/exec.js";
import { MediaAttachmentCache } from "./attachments.js";
import {
CLI_OUTPUT_MAX_BUFFER,
DEFAULT_AUDIO_MODELS,
DEFAULT_TIMEOUT_SECONDS,
MIN_AUDIO_FILE_BYTES,
resolveDefaultMediaModel,
} from "./defaults.js";
import { MediaUnderstandingSkipError } from "./errors.js";
import { fileExists } from "./fs.js";
@ -548,7 +548,14 @@ export async function runProviderEntry(params: {
config: params.config,
entry,
});
const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
const model =
entry.model?.trim() ||
resolveDefaultMediaModel({
cfg,
providerId,
capability: "audio",
}) ||
entry.model;
const result = await executeWithApiKeyRotation({
provider: providerId,
apiKeys,

View File

@ -25,12 +25,7 @@ import { mergeInboundPathRoots } from "../media/inbound-path-policy.js";
import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
import { runExec } from "../process/exec.js";
import { MediaAttachmentCache, selectAttachments } from "./attachments.js";
import {
AUTO_AUDIO_KEY_PROVIDERS,
AUTO_IMAGE_KEY_PROVIDERS,
AUTO_VIDEO_KEY_PROVIDERS,
DEFAULT_IMAGE_MODELS,
} from "./defaults.js";
import { resolveAutoMediaKeyProviders, resolveDefaultMediaModel } from "./defaults.js";
import { isMediaUnderstandingSkipError } from "./errors.js";
import { fileExists } from "./fs.js";
import { extractGeminiResponse } from "./output-extract.js";
@ -152,7 +147,11 @@ async function resolveAutoImageModelId(params: {
if (configuredModel) {
return configuredModel;
}
const defaultModel = DEFAULT_IMAGE_MODELS[params.providerId];
const defaultModel = resolveDefaultMediaModel({
cfg: params.cfg,
providerId: params.providerId,
capability: "image",
});
if (defaultModel) {
return defaultModel;
}
@ -470,7 +469,11 @@ async function resolveKeyEntry(params: {
cfg,
providerRegistry,
capability,
fallbackProviders: AUTO_IMAGE_KEY_PROVIDERS,
fallbackProviders: resolveAutoMediaKeyProviders({
cfg,
capability,
providerRegistry,
}),
})) {
const entry = await checkProvider(providerId);
if (entry) {
@ -492,7 +495,11 @@ async function resolveKeyEntry(params: {
cfg,
providerRegistry,
capability,
fallbackProviders: AUTO_VIDEO_KEY_PROVIDERS,
fallbackProviders: resolveAutoMediaKeyProviders({
cfg,
capability,
providerRegistry,
}),
})) {
const entry = await checkProvider(providerId, undefined);
if (entry) {
@ -513,7 +520,11 @@ async function resolveKeyEntry(params: {
cfg,
providerRegistry,
capability,
fallbackProviders: AUTO_AUDIO_KEY_PROVIDERS,
fallbackProviders: resolveAutoMediaKeyProviders({
cfg,
capability,
providerRegistry,
}),
})) {
const entry = await checkProvider(providerId, undefined);
if (entry) {

View File

@ -137,6 +137,9 @@ export type ImagesDescriptionResult = {
// A pluggable media-understanding backend. All handler fields are optional;
// a provider supports a capability only when the matching handler is present.
export type MediaUnderstandingProvider = {
  id: string;
  capabilities?: MediaUnderstandingCapability[];
  // Per-capability model used when none is configured.
  defaultModels?: Partial<Record<MediaUnderstandingCapability, string>>;
  // Auto-selection rank per capability; lower values are tried first, and
  // capabilities without an entry are excluded from auto-selection.
  autoPriority?: Partial<Record<MediaUnderstandingCapability, number>>;
  // Document formats accepted as raw bytes (currently only PDF).
  nativeDocumentInputs?: Array<"pdf">;
  transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
  describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
  describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;