mirror of https://github.com/openclaw/openclaw.git
refactor(media): move provider defaults into media metadata
This commit is contained in:
parent fca80d2ee2
commit 3a3f88a80a
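This change replaces the bundled lookup tables in the media-understanding defaults (DEFAULT_AUDIO_MODELS, DEFAULT_IMAGE_MODELS, the AUTO_*_KEY_PROVIDERS lists, and the PDF tool's NATIVE_PDF_PROVIDERS set) with per-provider metadata: each MediaUnderstandingProvider now declares defaultModels, autoPriority, and nativeDocumentInputs, and callers resolve them through the provider registry via resolveDefaultMediaModel, resolveAutoMediaKeyProviders, and providerSupportsNativePdfDocument. A minimal before/after sketch of a call site (hypothetical variable names, not code from this commit):

    // before: read a module-level table
    const modelId = DEFAULT_IMAGE_MODELS[providerId];

    // after: resolve from the provider's own metadata; cfg is optional and lets
    // config-registered providers override the bundled defaults
    const modelId = resolveDefaultMediaModel({ providerId, capability: "image" });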
@@ -7,6 +7,9 @@ import {
 export const anthropicMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "anthropic",
   capabilities: ["image"],
+  defaultModels: { image: "claude-opus-4-6" },
+  autoPriority: { image: 20 },
+  nativeDocumentInputs: ["pdf"],
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
 };
@@ -4,5 +4,7 @@ import { transcribeDeepgramAudio } from "./audio.js";
 export const deepgramMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "deepgram",
   capabilities: ["audio"],
+  defaultModels: { audio: "nova-3" },
+  autoPriority: { audio: 30 },
   transcribeAudio: transcribeDeepgramAudio,
 };
@@ -155,6 +155,13 @@ export async function describeGeminiVideo(
 export const googleMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "google",
   capabilities: ["image", "audio", "video"],
+  defaultModels: {
+    image: DEFAULT_GOOGLE_VIDEO_MODEL,
+    audio: DEFAULT_GOOGLE_AUDIO_MODEL,
+    video: DEFAULT_GOOGLE_VIDEO_MODEL,
+  },
+  autoPriority: { image: 30, audio: 40, video: 10 },
+  nativeDocumentInputs: ["pdf"],
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
   transcribeAudio: transcribeGeminiAudio,
@@ -9,6 +9,8 @@ const DEFAULT_GROQ_AUDIO_MODEL = "whisper-large-v3-turbo";
 export const groqMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "groq",
   capabilities: ["audio"],
+  defaultModels: { audio: DEFAULT_GROQ_AUDIO_MODEL },
+  autoPriority: { audio: 20 },
   transcribeAudio: (req) =>
     transcribeOpenAiCompatibleAudio({
       ...req,
@@ -7,6 +7,8 @@ import {
 export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "minimax",
   capabilities: ["image"],
+  defaultModels: { image: "MiniMax-VL-01" },
+  autoPriority: { image: 40 },
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
 };
@@ -14,6 +16,8 @@ export const minimaxMediaUnderstandingProvider: MediaUnderstandingProvider = {
 export const minimaxPortalMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "minimax-portal",
   capabilities: ["image"],
+  defaultModels: { image: "MiniMax-VL-01" },
+  autoPriority: { image: 50 },
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
 };
@@ -9,6 +9,8 @@ const DEFAULT_MISTRAL_AUDIO_MODEL = "voxtral-mini-latest";
 export const mistralMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "mistral",
   capabilities: ["audio"],
+  defaultModels: { audio: DEFAULT_MISTRAL_AUDIO_MODEL },
+  autoPriority: { audio: 50 },
   transcribeAudio: async (req) =>
     await transcribeOpenAiCompatibleAudio({
       ...req,
@@ -126,6 +126,8 @@ export async function describeMoonshotVideo(
 export const moonshotMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "moonshot",
   capabilities: ["image", "video"],
+  defaultModels: { image: "kimi-k2.5", video: DEFAULT_MOONSHOT_VIDEO_MODEL },
+  autoPriority: { video: 20 },
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
   describeVideo: describeMoonshotVideo,
@@ -21,6 +21,11 @@ export async function transcribeOpenAiAudio(params: AudioTranscriptionRequest) {
 export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "openai",
   capabilities: ["image", "audio"],
+  defaultModels: {
+    image: "gpt-5.4-mini",
+    audio: OPENAI_DEFAULT_AUDIO_TRANSCRIPTION_MODEL,
+  },
+  autoPriority: { image: 10, audio: 10 },
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
   transcribeAudio: transcribeOpenAiAudio,
@@ -29,6 +34,7 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
 export const openaiCodexMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "openai-codex",
   capabilities: ["image"],
+  defaultModels: { image: "gpt-5.4" },
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
 };
@@ -7,6 +7,7 @@ import {
 export const openrouterMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "openrouter",
   capabilities: ["image"],
+  defaultModels: { image: "auto" },
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
 };
@@ -11,9 +11,9 @@ vi.mock("openclaw/plugin-sdk/agent-runtime", () => ({
 }));
 
 vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
-  AUTO_IMAGE_KEY_PROVIDERS: ["openai"],
-  DEFAULT_IMAGE_MODELS: { openai: "gpt-4.1-mini" },
   resolveAutoImageModel: vi.fn(async () => null),
+  resolveAutoMediaKeyProviders: vi.fn(() => ["openai"]),
+  resolveDefaultMediaModel: vi.fn(() => "gpt-4.1-mini"),
 }));
 
 vi.mock("openclaw/plugin-sdk/media-understanding-runtime", () => ({
@@ -9,8 +9,11 @@ import {
 import { resolveDefaultModelForAgent } from "openclaw/plugin-sdk/agent-runtime";
 import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
 import { loadJsonFile, saveJsonFile } from "openclaw/plugin-sdk/json-store";
-import { AUTO_IMAGE_KEY_PROVIDERS, DEFAULT_IMAGE_MODELS } from "openclaw/plugin-sdk/media-runtime";
 import { resolveAutoImageModel } from "openclaw/plugin-sdk/media-runtime";
+import {
+  resolveAutoMediaKeyProviders,
+  resolveDefaultMediaModel,
+} from "openclaw/plugin-sdk/media-runtime";
 import { describeImageFileWithModel } from "openclaw/plugin-sdk/media-understanding-runtime";
 import { logVerbose } from "openclaw/plugin-sdk/runtime-env";
 import { STATE_DIR } from "openclaw/plugin-sdk/state-paths";
@@ -182,6 +185,11 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
     }
   };
 
+  const autoProviders = resolveAutoMediaKeyProviders({
+    cfg,
+    capability: "image",
+  });
+
   const selectCatalogModel = (provider: string) => {
     const entries = catalog.filter(
       (entry) =>
@@ -190,7 +198,11 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
     if (entries.length === 0) {
       return undefined;
     }
-    const defaultId = DEFAULT_IMAGE_MODELS[provider];
+    const defaultId = resolveDefaultMediaModel({
+      cfg,
+      providerId: provider,
+      capability: "image",
+    });
     const preferred = entries.find((entry) => entry.id === defaultId);
     return preferred ?? entries[0];
   };
@@ -198,16 +210,14 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
   let resolved = null as { provider: string; model?: string } | null;
   if (
     activeModel &&
-    AUTO_IMAGE_KEY_PROVIDERS.includes(
-      activeModel.provider as (typeof AUTO_IMAGE_KEY_PROVIDERS)[number],
-    ) &&
+    autoProviders.includes(activeModel.provider) &&
     (await hasProviderKey(activeModel.provider))
   ) {
     resolved = activeModel;
   }
 
   if (!resolved) {
-    for (const provider of AUTO_IMAGE_KEY_PROVIDERS) {
+    for (const provider of autoProviders) {
       if (!(await hasProviderKey(provider))) {
         continue;
       }
@@ -7,6 +7,8 @@ import {
 export const zaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "zai",
   capabilities: ["image"],
+  defaultModels: { image: "glm-4.6v" },
+  autoPriority: { image: 60 },
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
 };
@@ -126,11 +126,12 @@ function resolveImageGenerationModelCandidates(
     providerDefaults.set(providerId, `${providerId}/${modelId}`);
   }
 
+  const primaryProvider = resolveDefaultModelRef(cfg).provider;
   const orderedProviders = [
-    resolveDefaultModelRef(cfg).provider,
-    "openai",
-    "google",
-    ...providerDefaults.keys(),
+    primaryProvider,
+    ...[...providerDefaults.keys()]
+      .filter((providerId) => providerId !== primaryProvider)
+      .toSorted(),
   ];
   const orderedRefs: string[] = [];
   const seen = new Set<string>();
@@ -274,7 +274,7 @@ function createMinimaxImageConfig(): OpenClawConfig {
 function createDefaultImageFallbackExpectation(primary: string) {
   return {
     primary,
-    fallbacks: ["openai/gpt-5-mini", "anthropic/claude-opus-4-5"],
+    fallbacks: ["openai/gpt-5.4-mini", "anthropic/claude-opus-4-6"],
   };
 }
 
@@ -618,12 +618,12 @@ describe("image tool implicit imageModel config", () => {
       agents: {
         defaults: {
           model: { primary: "minimax/MiniMax-M2.7" },
-          imageModel: { primary: "openai/gpt-5-mini" },
+          imageModel: { primary: "openai/gpt-5.4-mini" },
         },
       },
     };
     expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
-      primary: "openai/gpt-5-mini",
+      primary: "openai/gpt-5.4-mini",
     });
   });
 });
@@ -638,7 +638,7 @@ describe("image tool implicit imageModel config", () => {
       agents: {
         defaults: {
           model: { primary: "acme/vision-1" },
-          imageModel: { primary: "openai/gpt-5-mini" },
+          imageModel: { primary: "openai/gpt-5.4-mini" },
         },
       },
       models: {
@@ -652,7 +652,7 @@ describe("image tool implicit imageModel config", () => {
     };
     // Tool should still be available for explicit image analysis requests
     expect(resolveImageModelConfigForTool({ cfg, agentDir })).toEqual({
-      primary: "openai/gpt-5-mini",
+      primary: "openai/gpt-5.4-mini",
     });
     const tool = createImageTool({ config: cfg, agentDir, modelHasVision: true });
     expect(tool).not.toBeNull();
@@ -1229,7 +1229,7 @@ describe("image tool response validation", () => {
       role: "assistant",
       api: "openai-responses",
       provider: "openai",
-      model: "gpt-5-mini",
+      model: "gpt-5.4-mini",
       stopReason: "stop",
       timestamp: Date.now(),
       usage: makeZeroUsageSnapshot(),
@@ -1278,7 +1278,7 @@ describe("image tool response validation", () => {
     expect(() =>
       __testing.coerceImageAssistantText({
         provider: "openai",
-        model: "gpt-5-mini",
+        model: "gpt-5.4-mini",
         message,
       }),
     ).toThrow(expectedError);
@@ -1,6 +1,10 @@
 import { resolve, isAbsolute } from "node:path";
 import { Type } from "@sinclair/typebox";
 import type { OpenClawConfig } from "../../config/config.js";
+import {
+  resolveAutoMediaKeyProviders,
+  resolveDefaultMediaModel,
+} from "../../media-understanding/defaults.js";
 import { getMediaUnderstandingProvider } from "../../media-understanding/provider-registry.js";
 import { buildProviderRegistry } from "../../media-understanding/runner.js";
 import { loadWebMedia } from "../../media/web-media.js";
@@ -40,8 +44,6 @@ import {
 } from "./tool-runtime.helpers.js";
 
 const DEFAULT_PROMPT = "Describe the image.";
-const ANTHROPIC_IMAGE_PRIMARY = "anthropic/claude-opus-4-6";
-const ANTHROPIC_IMAGE_FALLBACK = "anthropic/claude-opus-4-5";
 const DEFAULT_MAX_IMAGES = 20;
 
 const imageToolProviderDeps = {
@@ -103,28 +105,39 @@ export function resolveImageModelConfigForTool(params: {
     provider: primary.provider,
   });
   const primaryCandidates = (() => {
-    if (isMinimaxVlmProvider(primary.provider)) {
-      return [`${primary.provider}/MiniMax-VL-01`];
-    }
     if (providerVisionFromConfig) {
       return [providerVisionFromConfig];
     }
-    if (primary.provider === "zai") {
-      return ["zai/glm-4.6v"];
-    }
-    if (primary.provider === "openai") {
-      return ["openai/gpt-5-mini"];
-    }
-    if (primary.provider === "anthropic") {
-      return [ANTHROPIC_IMAGE_PRIMARY];
-    }
+    const providerDefault = resolveDefaultMediaModel({
+      cfg: params.cfg,
+      providerId: primary.provider,
+      capability: "image",
+    });
+    if (providerDefault) {
+      return [`${primary.provider}/${providerDefault}`];
+    }
+    if (isMinimaxVlmProvider(primary.provider)) {
+      return [`${primary.provider}/MiniMax-VL-01`];
+    }
     return [];
   })();
 
+  const autoCandidates = resolveAutoMediaKeyProviders({
+    cfg: params.cfg,
+    capability: "image",
+  }).map((providerId) => {
+    const modelId = resolveDefaultMediaModel({
+      cfg: params.cfg,
+      providerId,
+      capability: "image",
+    });
+    return modelId ? `${providerId}/${modelId}` : null;
+  });
+
   return buildToolModelConfigFromCandidates({
     explicit,
     agentDir: params.agentDir,
-    candidates: [...primaryCandidates, "openai/gpt-5-mini", ANTHROPIC_IMAGE_FALLBACK],
+    candidates: [...primaryCandidates, ...autoCandidates],
   });
 }
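For illustration, a minimal standalone sketch (not code from this commit) of how the auto candidate list lines up for the image capability, given the bundled metadata in the hunks above (autoPriority: openai 10, anthropic 20, google 30, minimax 40, minimax-portal 50, zai 60):

    const autoCandidates = resolveAutoMediaKeyProviders({ capability: "image" })
      .map((providerId) => {
        const modelId = resolveDefaultMediaModel({ providerId, capability: "image" });
        return modelId ? `${providerId}/${modelId}` : null;
      })
      // unlike the tool code above, this sketch drops null entries explicitly
      .filter((ref): ref is string => ref !== null);
    // begins with "openai/gpt-5.4-mini", "anthropic/claude-opus-4-6", ...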
@@ -4,22 +4,16 @@ import {
   resolveAgentModelFallbackValues,
   resolveAgentModelPrimaryValue,
 } from "../../config/model-input.js";
+import { providerSupportsNativePdfDocument } from "../../media-understanding/defaults.js";
 import { extractAssistantText } from "../pi-embedded-utils.js";
 
 export type PdfModelConfig = { primary?: string; fallbacks?: string[] };
 
-/**
- * Providers known to support native PDF document input.
- * When the model's provider is in this set, the tool sends raw PDF bytes
- * via provider-specific API calls instead of extracting text/images first.
- */
-export const NATIVE_PDF_PROVIDERS = new Set(["anthropic", "google"]);
-
 /**
  * Check whether a provider supports native PDF document input.
  */
 export function providerSupportsNativePdf(provider: string): boolean {
-  return NATIVE_PDF_PROVIDERS.has(provider.toLowerCase().trim());
+  return providerSupportsNativePdfDocument({ providerId: provider });
 }
 
 /**
@@ -46,7 +46,7 @@ async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promi
 }
 
 const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6";
-const OPENAI_PDF_MODEL = "openai/gpt-5-mini";
+const OPENAI_PDF_MODEL = "openai/gpt-5.4-mini";
 const TEST_PDF_INPUT = { base64: "dGVzdA==", filename: "doc.pdf" } as const;
 const FAKE_PDF_MEDIA = {
   kind: "document",
@@ -295,12 +295,12 @@ describe("resolvePdfModelConfigForTool", () => {
       agents: {
         defaults: {
           model: { primary: "openai/gpt-5.4" },
-          imageModel: { primary: "openai/gpt-5-mini" },
+          imageModel: { primary: "openai/gpt-5.4-mini" },
         },
       },
     };
     expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({
-      primary: "openai/gpt-5-mini",
+      primary: "openai/gpt-5.4-mini",
     });
   });
 });
@@ -1,6 +1,11 @@
 import { type Context, complete } from "@mariozechner/pi-ai";
 import { Type } from "@sinclair/typebox";
 import type { OpenClawConfig } from "../../config/config.js";
+import {
+  providerSupportsNativePdfDocument,
+  resolveAutoMediaKeyProviders,
+  resolveDefaultMediaModel,
+} from "../../media-understanding/defaults.js";
 import { extractPdfContent, type PdfExtractedContent } from "../../media/pdf-extract.js";
 import { loadWebMediaRaw } from "../../media/web-media.js";
 import { resolveUserPath } from "../../utils.js";
@@ -43,8 +48,6 @@ const DEFAULT_PROMPT = "Analyze this PDF document.";
 const DEFAULT_MAX_PDFS = 10;
 const DEFAULT_MAX_BYTES_MB = 10;
 const DEFAULT_MAX_PAGES = 20;
-const ANTHROPIC_PDF_PRIMARY = "anthropic/claude-opus-4-6";
-const ANTHROPIC_PDF_FALLBACK = "anthropic/claude-opus-4-5";
 
 const PDF_MIN_TEXT_CHARS = 200;
 const PDF_MAX_PIXELS = 4_000_000;
@@ -75,9 +78,7 @@ export function resolvePdfModelConfigForTool(params: {
 
   // Auto-detect from available providers
   const primary = resolveDefaultModelRef(params.cfg);
-  const anthropicOk = hasAuthForProvider({ provider: "anthropic", agentDir: params.agentDir });
   const googleOk = hasAuthForProvider({ provider: "google", agentDir: params.agentDir });
-  const openaiOk = hasAuthForProvider({ provider: "openai", agentDir: params.agentDir });
 
   const fallbacks: string[] = [];
   const addFallback = (ref: string) => {
@@ -95,30 +96,54 @@ export function resolvePdfModelConfigForTool(params: {
     cfg: params.cfg,
     provider: primary.provider,
   });
+  const providerDefault = resolveDefaultMediaModel({
+    cfg: params.cfg,
+    providerId: primary.provider,
+    capability: "image",
+  });
+  const nativePdfCandidates = resolveAutoMediaKeyProviders({
+    cfg: params.cfg,
+    capability: "image",
+  })
+    .filter((providerId) => providerSupportsNativePdfDocument({ cfg: params.cfg, providerId }))
+    .filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir }))
+    .map((providerId) => {
+      const modelId = resolveDefaultMediaModel({
+        cfg: params.cfg,
+        providerId,
+        capability: "image",
+      });
+      return modelId ? `${providerId}/${modelId}` : null;
+    })
+    .filter((value): value is string => Boolean(value));
+  const genericImageCandidates = resolveAutoMediaKeyProviders({
+    cfg: params.cfg,
+    capability: "image",
+  })
+    .filter((providerId) => hasAuthForProvider({ provider: providerId, agentDir: params.agentDir }))
+    .map((providerId) => {
+      const modelId = resolveDefaultMediaModel({
+        cfg: params.cfg,
+        providerId,
+        capability: "image",
+      });
+      return modelId ? `${providerId}/${modelId}` : null;
+    })
+    .filter((value): value is string => Boolean(value));
 
-  if (primary.provider === "anthropic" && anthropicOk) {
-    preferred = ANTHROPIC_PDF_PRIMARY;
-  } else if (primary.provider === "google" && googleOk && providerVision) {
+  if (primary.provider === "google" && googleOk && providerVision) {
     preferred = providerVision;
-  } else if (providerOk && providerVision) {
-    preferred = providerVision;
-  } else if (anthropicOk) {
-    preferred = ANTHROPIC_PDF_PRIMARY;
-  } else if (googleOk) {
-    preferred = "google/gemini-2.5-pro";
-  } else if (openaiOk) {
-    preferred = "openai/gpt-5-mini";
+  } else if (providerOk && (providerVision || providerDefault)) {
+    preferred = providerVision ?? `${primary.provider}/${providerDefault}`;
+  } else {
+    preferred = nativePdfCandidates[0] ?? genericImageCandidates[0] ?? null;
   }
 
   if (preferred?.trim()) {
-    if (anthropicOk && preferred !== ANTHROPIC_PDF_PRIMARY) {
-      addFallback(ANTHROPIC_PDF_PRIMARY);
-    }
-    if (anthropicOk) {
-      addFallback(ANTHROPIC_PDF_FALLBACK);
-    }
-    if (openaiOk) {
-      addFallback("openai/gpt-5-mini");
+    for (const candidate of [...nativePdfCandidates, ...genericImageCandidates]) {
+      if (candidate !== preferred) {
+        addFallback(candidate);
+      }
     }
     const pruned = fallbacks.filter((ref) => ref !== preferred);
     return { primary: preferred, ...(pruned.length > 0 ? { fallbacks: pruned } : {}) };
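A worked reading of the new auto-detection, under assumed conditions (keys for anthropic and openai only, a primary provider with neither vision config nor auth, bundled metadata as above, and addFallback deduplicating refs):

    // nativePdfCandidates: authed image providers whose metadata lists "pdf"
    //   -> ["anthropic/claude-opus-4-6"]
    // genericImageCandidates: every authed image provider in autoPriority order
    //   -> ["openai/gpt-5.4-mini", "anthropic/claude-opus-4-6"]
    // preferred falls through to nativePdfCandidates[0]; fallbacks keep the rest:
    //   { primary: "anthropic/claude-opus-4-6", fallbacks: ["openai/gpt-5.4-mini"] }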
@@ -178,7 +178,7 @@ describe("config schema regressions", () => {
         defaults: {
           pdfModel: {
             primary: "anthropic/claude-opus-4-6",
-            fallbacks: ["openai/gpt-5-mini"],
+            fallbacks: ["openai/gpt-5.4-mini"],
           },
           pdfMaxBytesMb: 12,
           pdfMaxPages: 25,
@@ -193,7 +193,7 @@ describe("config schema regressions", () => {
     const res = validateConfigObject({
       agents: {
         defaults: {
-          pdfModel: { primary: "openai/gpt-5-mini" },
+          pdfModel: { primary: "openai/gpt-5.4-mini" },
           pdfMaxBytesMb: 0,
           pdfMaxPages: 0,
         },
@@ -25,7 +25,8 @@ const DEFAULT_MODEL_ALIASES: Readonly<Record<string, string>> = {
 
   // OpenAI
   gpt: "openai/gpt-5.4",
-  "gpt-mini": "openai/gpt-5-mini",
+  "gpt-mini": "openai/gpt-5.4-mini",
+  "gpt-nano": "openai/gpt-5.4-nano",
 
   // Google Gemini (3.x are preview ids in the catalog)
   gemini: "google/gemini-3.1-pro-preview",
@@ -1,44 +1,64 @@
 import { describe, expect, it } from "vitest";
 import {
-  AUTO_AUDIO_KEY_PROVIDERS,
-  AUTO_IMAGE_KEY_PROVIDERS,
-  AUTO_VIDEO_KEY_PROVIDERS,
-  DEFAULT_AUDIO_MODELS,
-  DEFAULT_IMAGE_MODELS,
+  providerSupportsNativePdfDocument,
+  resolveAutoMediaKeyProviders,
+  resolveDefaultMediaModel,
 } from "./defaults.js";
 
-describe("DEFAULT_AUDIO_MODELS", () => {
-  it("includes Mistral Voxtral default", () => {
-    expect(DEFAULT_AUDIO_MODELS.mistral).toBe("voxtral-mini-latest");
+describe("resolveDefaultMediaModel", () => {
+  it("resolves bundled audio defaults from provider metadata", () => {
+    expect(resolveDefaultMediaModel({ providerId: "mistral", capability: "audio" })).toBe(
+      "voxtral-mini-latest",
+    );
+  });
+
+  it("resolves bundled image defaults beyond the historical core set", () => {
+    expect(resolveDefaultMediaModel({ providerId: "minimax-portal", capability: "image" })).toBe(
+      "MiniMax-VL-01",
+    );
+    expect(resolveDefaultMediaModel({ providerId: "openai-codex", capability: "image" })).toBe(
+      "gpt-5.4",
+    );
+    expect(resolveDefaultMediaModel({ providerId: "moonshot", capability: "image" })).toBe(
+      "kimi-k2.5",
+    );
+    expect(resolveDefaultMediaModel({ providerId: "openrouter", capability: "image" })).toBe(
+      "auto",
+    );
   });
 });
 
-describe("AUTO_AUDIO_KEY_PROVIDERS", () => {
-  it("includes mistral auto key resolution", () => {
-    expect(AUTO_AUDIO_KEY_PROVIDERS).toContain("mistral");
+describe("resolveAutoMediaKeyProviders", () => {
+  it("keeps the bundled audio fallback order", () => {
+    expect(resolveAutoMediaKeyProviders({ capability: "audio" })).toEqual([
+      "openai",
+      "groq",
+      "deepgram",
+      "google",
+      "mistral",
+    ]);
   });
+
+  it("keeps the bundled image fallback order", () => {
+    expect(resolveAutoMediaKeyProviders({ capability: "image" })).toEqual([
+      "openai",
+      "anthropic",
+      "google",
+      "minimax",
+      "minimax-portal",
+      "zai",
+    ]);
+  });
+
+  it("keeps the bundled video fallback order", () => {
+    expect(resolveAutoMediaKeyProviders({ capability: "video" })).toEqual(["google", "moonshot"]);
+  });
 });
 
-describe("AUTO_VIDEO_KEY_PROVIDERS", () => {
-  it("includes moonshot auto key resolution", () => {
-    expect(AUTO_VIDEO_KEY_PROVIDERS).toContain("moonshot");
-  });
-});
-
-describe("AUTO_IMAGE_KEY_PROVIDERS", () => {
-  it("includes minimax-portal auto key resolution", () => {
-    expect(AUTO_IMAGE_KEY_PROVIDERS).toContain("minimax-portal");
-  });
-});
-
-describe("DEFAULT_IMAGE_MODELS", () => {
-  it("includes the MiniMax portal vision default", () => {
-    expect(DEFAULT_IMAGE_MODELS["minimax-portal"]).toBe("MiniMax-VL-01");
-  });
-
-  it("includes bundled image-provider defaults beyond the core provider set", () => {
-    expect(DEFAULT_IMAGE_MODELS["openai-codex"]).toBe("gpt-5.4");
-    expect(DEFAULT_IMAGE_MODELS.moonshot).toBe("kimi-k2.5");
-    expect(DEFAULT_IMAGE_MODELS.openrouter).toBe("auto");
+describe("providerSupportsNativePdfDocument", () => {
+  it("reads native PDF support from provider metadata", () => {
+    expect(providerSupportsNativePdfDocument({ providerId: "anthropic" })).toBe(true);
+    expect(providerSupportsNativePdfDocument({ providerId: "google" })).toBe(true);
+    expect(providerSupportsNativePdfDocument({ providerId: "openai" })).toBe(false);
  });
 });
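The expected orders in these tests follow directly from the autoPriority values added in the provider hunks above, sorted ascending per capability: audio openai 10, groq 20, deepgram 30, google 40, mistral 50; image openai 10, anthropic 20, google 30, minimax 40, minimax-portal 50, zai 60; video google 10, moonshot 20.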
@@ -1,4 +1,6 @@
-import type { MediaUnderstandingCapability } from "./types.js";
+import type { OpenClawConfig } from "../config/config.js";
+import { buildMediaUnderstandingRegistry, normalizeMediaProviderId } from "./provider-registry.js";
+import type { MediaUnderstandingCapability, MediaUnderstandingProvider } from "./types.js";
 
 const MB = 1024 * 1024;
 
@@ -27,43 +29,79 @@ export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
   video: "Describe the video.",
 };
 export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
-export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
-  groq: "whisper-large-v3-turbo",
-  openai: "gpt-4o-mini-transcribe",
-  deepgram: "nova-3",
-  mistral: "voxtral-mini-latest",
-};
-
-export const AUTO_AUDIO_KEY_PROVIDERS = [
-  "openai",
-  "groq",
-  "deepgram",
-  "google",
-  "mistral",
-] as const;
-export const AUTO_IMAGE_KEY_PROVIDERS = [
-  "openai",
-  "anthropic",
-  "google",
-  "minimax",
-  "minimax-portal",
-  "zai",
-] as const;
-export const AUTO_VIDEO_KEY_PROVIDERS = ["google", "moonshot"] as const;
-export const DEFAULT_IMAGE_MODELS: Record<string, string> = {
-  openai: "gpt-5-mini",
-  "openai-codex": "gpt-5.4",
-  anthropic: "claude-opus-4-6",
-  google: "gemini-3-flash-preview",
-  minimax: "MiniMax-VL-01",
-  "minimax-portal": "MiniMax-VL-01",
-  moonshot: "kimi-k2.5",
-  openrouter: "auto",
-  zai: "glm-4.6v",
-};
 export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
 export const DEFAULT_MEDIA_CONCURRENCY = 2;
 
+function providerSupportsCapability(
+  provider: MediaUnderstandingProvider | undefined,
+  capability: MediaUnderstandingCapability,
+): boolean {
+  if (!provider) {
+    return false;
+  }
+  if (capability === "audio") {
+    return Boolean(provider.transcribeAudio);
+  }
+  if (capability === "image") {
+    return Boolean(provider.describeImage);
+  }
+  return Boolean(provider.describeVideo);
+}
+
+function resolveDefaultRegistry(cfg?: OpenClawConfig) {
+  return buildMediaUnderstandingRegistry(undefined, cfg ?? ({} as OpenClawConfig));
+}
+
+export function resolveDefaultMediaModel(params: {
+  providerId: string;
+  capability: MediaUnderstandingCapability;
+  cfg?: OpenClawConfig;
+  providerRegistry?: Map<string, MediaUnderstandingProvider>;
+}): string | undefined {
+  const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
+  const provider = registry.get(normalizeMediaProviderId(params.providerId));
+  return provider?.defaultModels?.[params.capability]?.trim() || undefined;
+}
+
+export function resolveAutoMediaKeyProviders(params: {
+  capability: MediaUnderstandingCapability;
+  cfg?: OpenClawConfig;
+  providerRegistry?: Map<string, MediaUnderstandingProvider>;
+}): string[] {
+  const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
+  type AutoProviderEntry = {
+    provider: MediaUnderstandingProvider;
+    priority: number;
+  };
+  return [...registry.values()]
+    .filter((provider) => providerSupportsCapability(provider, params.capability))
+    .map((provider): AutoProviderEntry | null => {
+      const priority = provider.autoPriority?.[params.capability];
+      return typeof priority === "number" && Number.isFinite(priority)
+        ? { provider, priority }
+        : null;
+    })
+    .filter((entry): entry is AutoProviderEntry => entry !== null)
+    .toSorted((left, right) => {
+      if (left.priority !== right.priority) {
+        return left.priority - right.priority;
+      }
+      return left.provider.id.localeCompare(right.provider.id);
+    })
+    .map((entry) => normalizeMediaProviderId(entry.provider.id))
+    .filter(Boolean);
+}
+
+export function providerSupportsNativePdfDocument(params: {
+  providerId: string;
+  cfg?: OpenClawConfig;
+  providerRegistry?: Map<string, MediaUnderstandingProvider>;
+}): boolean {
+  const registry = params.providerRegistry ?? resolveDefaultRegistry(params.cfg);
+  const provider = registry.get(normalizeMediaProviderId(params.providerId));
+  return provider?.nativeDocumentInputs?.includes("pdf") ?? false;
+}
+
 /**
  * Minimum audio file size in bytes below which transcription is skipped.
  * Files smaller than this threshold are almost certainly empty or corrupt
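A minimal usage sketch of the new helpers (return values taken from the bundled metadata elsewhere in this diff; cfg and providerRegistry are optional):

    import {
      providerSupportsNativePdfDocument,
      resolveAutoMediaKeyProviders,
      resolveDefaultMediaModel,
    } from "./defaults.js";

    // per-capability default model now comes from provider metadata
    resolveDefaultMediaModel({ providerId: "deepgram", capability: "audio" }); // "nova-3"

    // auto fallback order is derived from each provider's autoPriority
    resolveAutoMediaKeyProviders({ capability: "video" }); // ["google", "moonshot"]

    // native PDF support is read from nativeDocumentInputs
    providerSupportsNativePdfDocument({ providerId: "anthropic" }); // true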
@@ -14,6 +14,9 @@ function mergeProviderIntoRegistry(
         ...existing,
         ...provider,
         capabilities: provider.capabilities ?? existing.capabilities,
+        defaultModels: provider.defaultModels ?? existing.defaultModels,
+        autoPriority: provider.autoPriority ?? existing.autoPriority,
+        nativeDocumentInputs: provider.nativeDocumentInputs ?? existing.nativeDocumentInputs,
       }
     : provider;
   registry.set(normalizedKey, merged);
@@ -41,6 +44,9 @@ export function buildMediaUnderstandingRegistry(
         ...existing,
         ...provider,
         capabilities: provider.capabilities ?? existing.capabilities,
+        defaultModels: provider.defaultModels ?? existing.defaultModels,
+        autoPriority: provider.autoPriority ?? existing.autoPriority,
+        nativeDocumentInputs: provider.nativeDocumentInputs ?? existing.nativeDocumentInputs,
       }
     : provider;
   registry.set(normalizedKey, merged);
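Both merge sites guard each metadata field with ?? so that a registered override which leaves a field explicitly undefined still keeps the bundled value, which a plain spread would lose. A self-contained sketch of that rule (hypothetical types and values):

    type ProviderMeta = {
      id: string;
      defaultModels?: Record<string, string>;
      autoPriority?: Record<string, number>;
    };

    const bundled: ProviderMeta = {
      id: "zai",
      defaultModels: { image: "glm-4.6v" },
      autoPriority: { image: 60 },
    };
    // an override that only re-prioritizes the provider
    const override: ProviderMeta = { id: "zai", autoPriority: { image: 5 } };

    const merged: ProviderMeta = {
      ...bundled,
      ...override,
      defaultModels: override.defaultModels ?? bundled.defaultModels,
      autoPriority: override.autoPriority ?? bundled.autoPriority,
    };
    // merged.defaultModels stays { image: "glm-4.6v" }; merged.autoPriority.image === 5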
@@ -24,9 +24,9 @@ import { runExec } from "../process/exec.js";
 import { MediaAttachmentCache } from "./attachments.js";
 import {
   CLI_OUTPUT_MAX_BUFFER,
-  DEFAULT_AUDIO_MODELS,
   DEFAULT_TIMEOUT_SECONDS,
   MIN_AUDIO_FILE_BYTES,
+  resolveDefaultMediaModel,
 } from "./defaults.js";
 import { MediaUnderstandingSkipError } from "./errors.js";
 import { fileExists } from "./fs.js";
@@ -548,7 +548,14 @@ export async function runProviderEntry(params: {
     config: params.config,
     entry,
   });
-  const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
+  const model =
+    entry.model?.trim() ||
+    resolveDefaultMediaModel({
+      cfg,
+      providerId,
+      capability: "audio",
+    }) ||
+    entry.model;
   const result = await executeWithApiKeyRotation({
     provider: providerId,
     apiKeys,
@@ -25,12 +25,7 @@ import { mergeInboundPathRoots } from "../media/inbound-path-policy.js";
 import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
 import { runExec } from "../process/exec.js";
 import { MediaAttachmentCache, selectAttachments } from "./attachments.js";
-import {
-  AUTO_AUDIO_KEY_PROVIDERS,
-  AUTO_IMAGE_KEY_PROVIDERS,
-  AUTO_VIDEO_KEY_PROVIDERS,
-  DEFAULT_IMAGE_MODELS,
-} from "./defaults.js";
+import { resolveAutoMediaKeyProviders, resolveDefaultMediaModel } from "./defaults.js";
 import { isMediaUnderstandingSkipError } from "./errors.js";
 import { fileExists } from "./fs.js";
 import { extractGeminiResponse } from "./output-extract.js";
@@ -152,7 +147,11 @@ async function resolveAutoImageModelId(params: {
   if (configuredModel) {
     return configuredModel;
   }
-  const defaultModel = DEFAULT_IMAGE_MODELS[params.providerId];
+  const defaultModel = resolveDefaultMediaModel({
+    cfg: params.cfg,
+    providerId: params.providerId,
+    capability: "image",
+  });
   if (defaultModel) {
     return defaultModel;
   }
@@ -470,7 +469,11 @@ async function resolveKeyEntry(params: {
     cfg,
     providerRegistry,
     capability,
-    fallbackProviders: AUTO_IMAGE_KEY_PROVIDERS,
+    fallbackProviders: resolveAutoMediaKeyProviders({
+      cfg,
+      capability,
+      providerRegistry,
+    }),
   })) {
     const entry = await checkProvider(providerId);
     if (entry) {
@@ -492,7 +495,11 @@ async function resolveKeyEntry(params: {
     cfg,
     providerRegistry,
     capability,
-    fallbackProviders: AUTO_VIDEO_KEY_PROVIDERS,
+    fallbackProviders: resolveAutoMediaKeyProviders({
+      cfg,
+      capability,
+      providerRegistry,
+    }),
   })) {
     const entry = await checkProvider(providerId, undefined);
     if (entry) {
@@ -513,7 +520,11 @@ async function resolveKeyEntry(params: {
     cfg,
     providerRegistry,
     capability,
-    fallbackProviders: AUTO_AUDIO_KEY_PROVIDERS,
+    fallbackProviders: resolveAutoMediaKeyProviders({
+      cfg,
+      capability,
+      providerRegistry,
+    }),
   })) {
     const entry = await checkProvider(providerId, undefined);
     if (entry) {
@@ -137,6 +137,9 @@ export type ImagesDescriptionResult = {
 export type MediaUnderstandingProvider = {
   id: string;
   capabilities?: MediaUnderstandingCapability[];
+  defaultModels?: Partial<Record<MediaUnderstandingCapability, string>>;
+  autoPriority?: Partial<Record<MediaUnderstandingCapability, number>>;
+  nativeDocumentInputs?: Array<"pdf">;
   transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
   describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
   describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;