mirror of https://github.com/openclaw/openclaw.git
Media: extract runtime auto and orchestration
This commit is contained in:
parent
d303a7b6af
commit
37ee04e9b0
|
|
@@ -25,7 +25,7 @@ import {
|
|||
isNonSecretApiKeyMarker,
|
||||
OLLAMA_LOCAL_AUTH_MARKER,
|
||||
} from "./model-auth-markers.js";
|
||||
import { normalizeProviderId } from "./model-selection.js";
|
||||
import { normalizeProviderId } from "./provider-id.js";
|
||||
|
||||
export { ensureAuthProfileStore, resolveAuthProfileOrder } from "./auth-profiles.js";
|
||||
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,52 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { DEFAULT_IMAGE_MODELS } from "../media-understanding/defaults.js";
|
||||
import { resolveAutoImageModel } from "./media-runtime-auto.js";
|
||||
import { buildExtensionHostMediaUnderstandingRegistry } from "./media-runtime-registry.js";
|
||||
|
||||
/**
 * Builds the minimal config fixture used by these tests: a single `openai`
 * provider with a dummy API key and an empty model list.
 *
 * The literal is intentionally partial, which is why it is cast through
 * `unknown` rather than declared as a complete `OpenClawConfig`.
 */
function createImageCfg(): OpenClawConfig {
  return {
    models: {
      providers: {
        openai: {
          apiKey: "test-key",
          models: [],
        },
      },
    },
  } as unknown as OpenClawConfig;
}
|
||||
|
||||
describe("media runtime auto image model", () => {
  // The active provider is registered and has a key in the fixture config, so
  // the requested provider/model pair must be returned unchanged.
  it("keeps a valid active image model", async () => {
    const result = await resolveAutoImageModel({
      cfg: createImageCfg(),
      providerRegistry: buildExtensionHostMediaUnderstandingRegistry(),
      activeModel: {
        provider: "openai",
        model: "gpt-4.1-mini",
      },
    });

    expect(result).toEqual({
      provider: "openai",
      model: "gpt-4.1-mini",
    });
  });

  // An unknown active provider cannot be used, so resolution is expected to
  // land on the keyed `openai` provider with its default image model.
  it("falls back to the default keyed image model when the active model cannot be used", async () => {
    const result = await resolveAutoImageModel({
      cfg: createImageCfg(),
      providerRegistry: buildExtensionHostMediaUnderstandingRegistry(),
      activeModel: {
        provider: "missing-provider",
        model: "ignored",
      },
    });

    expect(result).toEqual({
      provider: "openai",
      model: DEFAULT_IMAGE_MODELS.openai,
    });
  });
});
|
||||
|
|
@@ -0,0 +1,499 @@
|
|||
import { constants as fsConstants } from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import {
|
||||
resolveAgentModelFallbackValues,
|
||||
resolveAgentModelPrimaryValue,
|
||||
} from "../config/model-input.js";
|
||||
import type { MediaUnderstandingModelConfig } from "../config/types.tools.js";
|
||||
import {
|
||||
getExtensionHostMediaUnderstandingProvider,
|
||||
normalizeExtensionHostMediaProviderId,
|
||||
type ExtensionHostMediaUnderstandingProviderRegistry,
|
||||
} from "../extension-host/media-runtime-registry.js";
|
||||
import {
|
||||
AUTO_AUDIO_KEY_PROVIDERS,
|
||||
AUTO_IMAGE_KEY_PROVIDERS,
|
||||
AUTO_VIDEO_KEY_PROVIDERS,
|
||||
DEFAULT_IMAGE_MODELS,
|
||||
} from "../media-understanding/defaults.js";
|
||||
import { fileExists } from "../media-understanding/fs.js";
|
||||
import { extractGeminiResponse } from "../media-understanding/output-extract.js";
|
||||
import type { MediaUnderstandingCapability } from "../media-understanding/types.js";
|
||||
import { runExec } from "../process/exec.js";
|
||||
|
||||
/**
 * Identifies the model the caller is currently using.
 *
 * `provider` is the raw provider id (the resolvers in this module trim it
 * before use); `model` is optional and may be omitted when only the provider
 * is known.
 */
export type ActiveMediaModel = {
  provider: string;
  model?: string;
};
|
||||
|
||||
/** Local shorthand for the extension-host media-understanding provider registry type. */
type ProviderRegistry = ExtensionHostMediaUnderstandingProviderRegistry;
|
||||
|
||||
/**
 * Memoized binary lookups, keyed by the name exactly as it was requested.
 * Values are the lookup promises themselves, so concurrent callers share one
 * in-flight search.
 */
const binaryCache = new Map<string, Promise<string | null>>();

/** Memoized Gemini CLI probe result (stored under the single key `"gemini"`). */
const geminiProbeCache = new Map<string, Promise<boolean>>();

/**
 * Test hook: forgets every memoized binary lookup and Gemini probe so the next
 * call re-runs the search/probe.
 */
export function clearMediaUnderstandingBinaryCacheForTests(): void {
  binaryCache.clear();
  geminiProbeCache.clear();
}
|
||||
|
||||
/**
 * Expands a leading `~` to the current user's home directory.
 *
 * Only the bare `~` and the `~/...` forms are expanded; anything else that
 * starts with `~` (for example `~other/bin`) is returned unchanged, as is any
 * value that does not start with `~`.
 *
 * @param value Path-like string to expand.
 * @returns The expanded path, or `value` itself when no expansion applies.
 */
function expandHomeDir(value: string): string {
  if (!value.startsWith("~")) {
    return value;
  }
  const home = os.homedir();
  if (value === "~") {
    return home;
  }
  // Drop the "~/" prefix and re-anchor the remainder under the home directory.
  return value.startsWith("~/") ? path.join(home, value.slice(2)) : value;
}
|
||||
|
||||
/**
 * Reports whether `value` contains a forward slash or a backslash, i.e. looks
 * like a path rather than a bare command name.
 */
function hasPathSeparator(value: string): boolean {
  return /[\\/]/.test(value);
}
|
||||
|
||||
/**
 * Lists the file names to probe for a command.
 *
 * Off Windows, or when `name` already carries an extension, the only candidate
 * is `name` itself. On Windows an extension-less name is followed by one
 * candidate per distinct `PATHEXT` entry (default `.EXE;.CMD;.BAT;.COM`), in
 * the order the entries appear. Entries are trimmed, blank ones are dropped,
 * and a missing leading dot is added.
 *
 * @param name Command name or path.
 * @returns Candidate names, `name` first.
 */
function candidateBinaryNames(name: string): string[] {
  const needsExtensionProbe = process.platform === "win32" && !path.extname(name);
  if (!needsExtensionProbe) {
    return [name];
  }
  // A Set keeps first-seen order while dropping exact duplicates.
  const extensions = new Set<string>();
  for (const raw of (process.env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM").split(";")) {
    const trimmed = raw.trim();
    if (trimmed) {
      extensions.add(trimmed.startsWith(".") ? trimmed : `.${trimmed}`);
    }
  }
  return [name, ...Array.from(extensions, (ext) => `${name}${ext}`)];
}
|
||||
|
||||
/**
 * Checks whether `filePath` points at a regular file that may be executed.
 *
 * On Windows any regular file qualifies; elsewhere the file must also pass an
 * `X_OK` access check. Every filesystem error (missing file, permission
 * denied, ...) is reported as `false` rather than thrown.
 */
async function isExecutable(filePath: string): Promise<boolean> {
  try {
    const stat = await fs.stat(filePath);
    if (!stat.isFile()) {
      return false;
    }
    if (process.platform !== "win32") {
      // Rejects when the execute bit is not available to this process.
      await fs.access(filePath, fsConstants.X_OK);
    }
    return true;
  } catch {
    return false;
  }
}
|
||||
|
||||
/**
 * Locates an executable by name or path, memoizing the lookup per `name`.
 *
 * Resolution order:
 * 1. If the trimmed, home-expanded name contains a path separator, each
 *    candidate file name for it is tested directly.
 * 2. Otherwise (or when step 1 finds nothing) every `PATH` entry is searched,
 *    after trimming, stripping one pair of surrounding double quotes and
 *    expanding a leading `~`.
 *
 * @param name Command name or path; surrounding whitespace is ignored for the
 *   lookup but the cache key is the untrimmed value.
 * @returns The first executable match, or `null` when nothing is found or the
 *   name is blank.
 */
async function findBinary(name: string): Promise<string | null> {
  const pending = binaryCache.get(name);
  if (pending) {
    return pending;
  }

  const lookup = async (): Promise<string | null> => {
    const searchName = name.trim();
    const direct = expandHomeDir(searchName);
    if (direct && hasPathSeparator(direct)) {
      for (const candidate of candidateBinaryNames(direct)) {
        if (await isExecutable(candidate)) {
          return candidate;
        }
      }
    }

    if (!searchName) {
      return null;
    }
    // NOTE(review): a name containing a separator that was not found above is
    // still joined onto each PATH entry here (unexpanded) — confirm whether
    // that fall-through is intended.
    const candidates = candidateBinaryNames(searchName);
    for (const entryRaw of (process.env.PATH ?? "").split(path.delimiter)) {
      const entry = expandHomeDir(entryRaw.trim().replace(/^"(.*)"$/, "$1"));
      if (!entry) {
        continue;
      }
      for (const candidate of candidates) {
        const fullPath = path.join(entry, candidate);
        if (await isExecutable(fullPath)) {
          return fullPath;
        }
      }
    }

    return null;
  };

  // Cache the promise before awaiting so concurrent callers share one search.
  const resolved = lookup();
  binaryCache.set(name, resolved);
  return resolved;
}
|
||||
|
||||
/** Resolves to `true` when `findBinary` locates an executable for `name`. */
async function hasBinary(name: string): Promise<boolean> {
  const located = await findBinary(name);
  return Boolean(located);
}
|
||||
|
||||
/**
 * Checks once whether a working `gemini` CLI is available.
 *
 * The probe requires the binary to be discoverable and then runs
 * `gemini --output-format json ok` with an 8000 ms timeout. It succeeds when
 * `extractGeminiResponse` yields a truthy value or, if that returns
 * `null`/`undefined`, when stdout contains "ok" (case-insensitive). Any error
 * from the invocation counts as "not available".
 *
 * The outcome — including a negative one — is memoized until
 * `clearMediaUnderstandingBinaryCacheForTests` is called.
 */
async function probeGeminiCli(): Promise<boolean> {
  const pending = geminiProbeCache.get("gemini");
  if (pending) {
    return pending;
  }

  const probe = async (): Promise<boolean> => {
    if (!(await hasBinary("gemini"))) {
      return false;
    }
    try {
      const { stdout } = await runExec("gemini", ["--output-format", "json", "ok"], {
        timeoutMs: 8000,
      });
      return Boolean(extractGeminiResponse(stdout) ?? stdout.toLowerCase().includes("ok"));
    } catch {
      return false;
    }
  };

  const resolved = probe();
  geminiProbeCache.set("gemini", resolved);
  return resolved;
}
|
||||
|
||||
/**
 * Builds a CLI entry for a local `whisper-cli` install, if one is usable.
 *
 * The model file comes from `WHISPER_CPP_MODEL` when that is set and exists,
 * otherwise from the hard-coded Homebrew path below.
 *
 * @returns The CLI entry, or `null` when the binary or the chosen model file
 *   is missing.
 */
async function resolveLocalWhisperCppEntry(): Promise<MediaUnderstandingModelConfig | null> {
  if (!(await hasBinary("whisper-cli"))) {
    return null;
  }

  const envModel = process.env.WHISPER_CPP_MODEL?.trim();
  const defaultModel = "/opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin";
  const modelPath = envModel && (await fileExists(envModel)) ? envModel : defaultModel;
  if (!(await fileExists(modelPath))) {
    return null;
  }

  // {{...}} tokens are template placeholders; presumably substituted by the
  // CLI runner — confirm against the entry runner.
  return {
    type: "cli",
    command: "whisper-cli",
    args: ["-m", modelPath, "-otxt", "-of", "{{OutputBase}}", "-np", "-nt", "{{MediaPath}}"],
  };
}
|
||||
|
||||
/**
 * Builds a CLI entry for a local `whisper` command, if one is on the machine.
 *
 * @returns The CLI entry (model `turbo`, text output into `{{OutputDir}}`), or
 *   `null` when the `whisper` binary cannot be found.
 */
async function resolveLocalWhisperEntry(): Promise<MediaUnderstandingModelConfig | null> {
  const available = await hasBinary("whisper");
  if (!available) {
    return null;
  }

  return {
    type: "cli",
    command: "whisper",
    args: [
      "--model",
      "turbo",
      "--output_format",
      "txt",
      "--output_dir",
      "{{OutputDir}}",
      "--verbose",
      "False",
      "{{MediaPath}}",
    ],
  };
}
|
||||
|
||||
/**
 * Builds a CLI entry for `sherpa-onnx-offline`, if it is fully set up.
 *
 * Requires the binary plus a `SHERPA_ONNX_MODEL_DIR` directory containing
 * `tokens.txt`, `encoder.onnx`, `decoder.onnx` and `joiner.onnx`.
 *
 * @returns The CLI entry, or `null` when the binary, the environment variable
 *   or any of the four model files is missing.
 */
async function resolveSherpaOnnxEntry(): Promise<MediaUnderstandingModelConfig | null> {
  if (!(await hasBinary("sherpa-onnx-offline"))) {
    return null;
  }
  const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim();
  if (!modelDir) {
    return null;
  }

  const tokens = path.join(modelDir, "tokens.txt");
  const encoder = path.join(modelDir, "encoder.onnx");
  const decoder = path.join(modelDir, "decoder.onnx");
  const joiner = path.join(modelDir, "joiner.onnx");

  // Checked one at a time, in this order, stopping at the first missing file.
  for (const requiredFile of [tokens, encoder, decoder, joiner]) {
    if (!(await fileExists(requiredFile))) {
      return null;
    }
  }

  return {
    type: "cli",
    command: "sherpa-onnx-offline",
    args: [
      `--tokens=${tokens}`,
      `--encoder=${encoder}`,
      `--decoder=${decoder}`,
      `--joiner=${joiner}`,
      "{{MediaPath}}",
    ],
  };
}
|
||||
|
||||
/**
 * Picks the first usable local audio transcription CLI.
 *
 * Preference order: sherpa-onnx, then whisper.cpp (`whisper-cli`), then the
 * `whisper` command. Later options are only probed when earlier ones are
 * unavailable.
 *
 * @returns The first available CLI entry, or `null` when none is usable.
 */
async function resolveLocalAudioEntry(): Promise<MediaUnderstandingModelConfig | null> {
  return (
    (await resolveSherpaOnnxEntry()) ??
    (await resolveLocalWhisperCppEntry()) ??
    (await resolveLocalWhisperEntry())
  );
}
|
||||
|
||||
/**
 * Builds a CLI entry that routes media understanding through the `gemini`
 * CLI, provided the CLI probe succeeds.
 *
 * @param _capability Currently unused; the same entry is returned for every
 *   capability.
 * @returns The CLI entry, or `null` when the Gemini CLI probe fails.
 */
async function resolveGeminiCliEntry(
  _capability: MediaUnderstandingCapability,
): Promise<MediaUnderstandingModelConfig | null> {
  const geminiAvailable = await probeGeminiCli();
  if (!geminiAvailable) {
    return null;
  }

  return {
    type: "cli",
    command: "gemini",
    args: [
      "--output-format",
      "json",
      "--allowed-tools",
      "read_many_files",
      "--include-directories",
      "{{MediaDir}}",
      "{{Prompt}}",
      "Use read_many_files to read {{MediaPath}} and respond with only the text output.",
    ],
  };
}
|
||||
|
||||
/**
 * Turns the caller's active model into a provider entry when that provider can
 * serve the requested capability.
 *
 * The provider must (a) have a non-blank id that normalizes to something,
 * (b) be present in the registry, (c) implement the handler matching
 * `capability`, and (d) have an API key that `resolveApiKeyForProvider` can
 * resolve without throwing.
 *
 * @returns A `provider` entry carrying the normalized provider id and the
 *   active model name as given, or `null` when any check fails.
 */
async function resolveActiveModelEntry(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  capability: MediaUnderstandingCapability;
  activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig | null> {
  const activeProviderRaw = params.activeModel?.provider?.trim();
  if (!activeProviderRaw) {
    return null;
  }
  const providerId = normalizeExtensionHostMediaProviderId(activeProviderRaw);
  if (!providerId) {
    return null;
  }
  const provider = getExtensionHostMediaUnderstandingProvider(providerId, params.providerRegistry);
  if (!provider) {
    return null;
  }

  // The provider must implement the handler for the requested capability.
  if (params.capability === "audio" && !provider.transcribeAudio) {
    return null;
  }
  if (params.capability === "image" && !provider.describeImage) {
    return null;
  }
  if (params.capability === "video" && !provider.describeVideo) {
    return null;
  }

  // Called only to verify that credentials resolve; the key itself is unused.
  try {
    await resolveApiKeyForProvider({
      provider: providerId,
      cfg: params.cfg,
      agentDir: params.agentDir,
    });
  } catch {
    return null;
  }

  return {
    type: "provider",
    provider: providerId,
    model: params.activeModel?.model,
  };
}
|
||||
|
||||
/**
 * Finds the first provider that supports `capability` and has a resolvable API
 * key.
 *
 * The active model's provider is tried first (with the active model name).
 * After that the capability's auto-provider list is walked in order: image
 * providers are paired with their `DEFAULT_IMAGE_MODELS` entry, video and
 * audio providers are returned without a model.
 *
 * @returns A `provider` entry for the first usable provider, or `null` when
 *   none qualifies.
 */
async function resolveKeyEntry(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  capability: MediaUnderstandingCapability;
  activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig | null> {
  const { cfg, agentDir, providerRegistry, capability } = params;

  // Returns an entry when the provider exists, implements the capability and
  // its API key resolves; otherwise null.
  const checkProvider = async (
    providerId: string,
    model?: string,
  ): Promise<MediaUnderstandingModelConfig | null> => {
    const provider = getExtensionHostMediaUnderstandingProvider(providerId, providerRegistry);
    if (!provider) {
      return null;
    }
    if (capability === "audio" && !provider.transcribeAudio) {
      return null;
    }
    if (capability === "image" && !provider.describeImage) {
      return null;
    }
    if (capability === "video" && !provider.describeVideo) {
      return null;
    }
    try {
      await resolveApiKeyForProvider({ provider: providerId, cfg, agentDir });
      return { type: "provider", provider: providerId, model };
    } catch {
      return null;
    }
  };

  // Every capability gives the active provider the first chance.
  // NOTE(review): unlike resolveActiveModelEntry, the id is only trimmed here,
  // not passed through normalizeExtensionHostMediaProviderId — confirm.
  const activeProvider = params.activeModel?.provider?.trim();
  if (activeProvider) {
    const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
    if (activeEntry) {
      return activeEntry;
    }
  }

  if (capability === "image") {
    for (const providerId of AUTO_IMAGE_KEY_PROVIDERS) {
      const entry = await checkProvider(providerId, DEFAULT_IMAGE_MODELS[providerId]);
      if (entry) {
        return entry;
      }
    }
    return null;
  }

  // Video uses its own list; anything else falls back to the audio list.
  const fallbackProviders =
    capability === "video" ? AUTO_VIDEO_KEY_PROVIDERS : AUTO_AUDIO_KEY_PROVIDERS;
  for (const providerId of fallbackProviders) {
    const entry = await checkProvider(providerId, undefined);
    if (entry) {
      return entry;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Converts the agent-default image model setting (`agents.defaults.imageModel`)
 * into provider entries.
 *
 * The primary value comes first, followed by the fallbacks in their given
 * order. Each reference is split at its first `/` into provider and model;
 * references without a `/`, or with nothing before or after it, are dropped.
 *
 * @returns One `provider` entry per well-formed reference; empty when none.
 */
function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstandingModelConfig[] {
  const imageModel = cfg.agents?.defaults?.imageModel;

  const refs: string[] = [];
  const primary = resolveAgentModelPrimaryValue(imageModel);
  if (primary?.trim()) {
    refs.push(primary.trim());
  }
  for (const fallback of resolveAgentModelFallbackValues(imageModel)) {
    if (fallback?.trim()) {
      refs.push(fallback.trim());
    }
  }

  const entries: MediaUnderstandingModelConfig[] = [];
  for (const ref of refs) {
    const slashIdx = ref.indexOf("/");
    // Require a non-empty provider before and a non-empty model after the slash.
    const wellFormed = slashIdx > 0 && slashIdx < ref.length - 1;
    if (!wellFormed) {
      continue;
    }
    entries.push({
      type: "provider",
      provider: ref.slice(0, slashIdx),
      model: ref.slice(slashIdx + 1),
    });
  }
  return entries;
}
|
||||
|
||||
/**
 * Chooses model entries automatically when none are configured.
 *
 * The first source that produces something wins:
 * 1. the caller's active model, if its provider supports the capability;
 * 2. for audio only, a local transcription CLI;
 * 3. for image only, the agent-default image model references;
 * 4. the Gemini CLI, if its probe succeeds;
 * 5. the first provider with a resolvable API key.
 *
 * @returns The chosen entries, or an empty array when nothing is available.
 */
export async function resolveAutoEntries(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  capability: MediaUnderstandingCapability;
  activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig[]> {
  const activeEntry = await resolveActiveModelEntry(params);
  if (activeEntry) {
    return [activeEntry];
  }

  if (params.capability === "audio") {
    const localAudio = await resolveLocalAudioEntry();
    if (localAudio) {
      return [localAudio];
    }
  }

  if (params.capability === "image") {
    const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg);
    if (imageModelEntries.length > 0) {
      return imageModelEntries;
    }
  }

  const gemini = await resolveGeminiCliEntry(params.capability);
  if (gemini) {
    return [gemini];
  }

  const keyed = await resolveKeyEntry(params);
  return keyed ? [keyed] : [];
}
|
||||
|
||||
/**
 * Resolves the provider/model pair to use for image understanding.
 *
 * The active model is preferred when its provider can describe images and has
 * a resolvable key; otherwise the first keyed provider is used. A missing
 * model name is filled in from `DEFAULT_IMAGE_MODELS`.
 *
 * @returns The provider and model to use, or `null` when no provider
 *   qualifies or no model name can be determined.
 */
export async function resolveAutoImageModel(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  activeModel?: ActiveMediaModel;
  providerRegistry: ProviderRegistry;
}): Promise<ActiveMediaModel | null> {
  // Converts a resolved entry into a provider/model pair; CLI entries and
  // entries lacking a provider or a determinable model yield null.
  const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => {
    if (!entry || entry.type === "cli") {
      return null;
    }
    const provider = entry.provider;
    if (!provider) {
      return null;
    }
    const model = entry.model ?? DEFAULT_IMAGE_MODELS[provider];
    return model ? { provider, model } : null;
  };

  const lookup = {
    cfg: params.cfg,
    agentDir: params.agentDir,
    providerRegistry: params.providerRegistry,
    capability: "image" as const,
    activeModel: params.activeModel,
  };

  const resolvedActive = toActive(await resolveActiveModelEntry(lookup));
  if (resolvedActive) {
    return resolvedActive;
  }
  return toActive(await resolveKeyEntry(lookup));
}
|
||||
|
|
@@ -0,0 +1,58 @@
|
|||
import { describe, expect, it, vi } from "vitest";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import {
|
||||
createMediaAttachmentCache,
|
||||
normalizeMediaAttachments,
|
||||
} from "../media-understanding/runner.js";
|
||||
import { runCapability } from "./media-runtime-orchestration.js";
|
||||
import { buildExtensionHostMediaUnderstandingRegistry } from "./media-runtime-registry.js";
|
||||
|
||||
// Single-model catalog fixture: an OpenAI model that accepts image input.
const catalog = [
  {
    id: "gpt-4.1",
    name: "GPT-4.1",
    provider: "openai",
    input: ["text", "image"] as const,
  },
];

// Keep the real model-catalog module but make `loadModelCatalog` return the
// fixture above, so the test does not depend on the real catalog contents.
vi.mock("../agents/model-catalog.js", async () => {
  const actual = await vi.importActual<typeof import("../agents/model-catalog.js")>(
    "../agents/model-catalog.js",
  );
  return {
    ...actual,
    loadModelCatalog: vi.fn(async () => catalog),
  };
});
|
||||
|
||||
describe("media runtime orchestration", () => {
  // With a vision-capable active model in the (mocked) catalog, image
  // understanding must be skipped and the skip recorded per attachment.
  it("skips image understanding when the active model already supports vision", async () => {
    const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" };
    const media = normalizeMediaAttachments(ctx);
    const cache = createMediaAttachmentCache(media);
    const cfg = {} as OpenClawConfig;

    try {
      const result = await runCapability({
        capability: "image",
        cfg,
        ctx,
        attachments: cache,
        media,
        providerRegistry: buildExtensionHostMediaUnderstandingRegistry(),
        activeModel: { provider: "openai", model: "gpt-4.1" },
      });

      expect(result.outputs).toHaveLength(0);
      expect(result.decision.outcome).toBe("skipped");
      expect(result.decision.attachments).toHaveLength(1);
      expect(result.decision.attachments[0]?.attempts[0]?.reason).toBe(
        "primary model supports vision natively",
      );
    } finally {
      // Always release the attachment cache, even when an assertion fails.
      await cache.cleanup();
    }
  });
});
|
||||
|
|
@@ -0,0 +1,272 @@
|
|||
import {
|
||||
findModelInCatalog,
|
||||
loadModelCatalog,
|
||||
modelSupportsVision,
|
||||
} from "../agents/model-catalog.js";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type {
|
||||
MediaUnderstandingConfig,
|
||||
MediaUnderstandingModelConfig,
|
||||
} from "../config/types.tools.js";
|
||||
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||
import { MediaAttachmentCache, selectAttachments } from "../media-understanding/attachments.js";
|
||||
import { isMediaUnderstandingSkipError } from "../media-understanding/errors.js";
|
||||
import { resolveModelEntries, resolveScopeDecision } from "../media-understanding/resolve.js";
|
||||
import {
|
||||
buildModelDecision,
|
||||
formatDecisionSummary,
|
||||
runCliEntry,
|
||||
runProviderEntry,
|
||||
} from "../media-understanding/runner.entries.js";
|
||||
import type {
|
||||
MediaAttachment,
|
||||
MediaUnderstandingCapability,
|
||||
MediaUnderstandingDecision,
|
||||
MediaUnderstandingModelDecision,
|
||||
MediaUnderstandingOutput,
|
||||
MediaUnderstandingProvider,
|
||||
} from "../media-understanding/types.js";
|
||||
import { resolveAutoEntries, type ActiveMediaModel } from "./media-runtime-auto.js";
|
||||
|
||||
/** Registry of media-understanding providers, keyed by a string id. */
type ProviderRegistry = Map<string, MediaUnderstandingProvider>;
|
||||
|
||||
/**
 * Result of running one media-understanding capability over a message.
 *
 * `outputs` holds one item per attachment that produced output; `decision`
 * records the overall outcome together with the per-attachment attempts.
 */
export type RunCapabilityResult = {
  outputs: MediaUnderstandingOutput[];
  decision: MediaUnderstandingDecision;
};
|
||||
|
||||
/**
 * Runs the model entries against one attachment, in order, until one of them
 * produces output.
 *
 * An entry is treated as a CLI entry when its `type` is `"cli"`, or when it
 * has no `type` but does have a `command`; otherwise it is a provider entry.
 * Every attempt is recorded:
 * - a truthy result is recorded as `success` and ends the loop;
 * - a falsy result is recorded as `skipped` with reason "empty output";
 * - a media-understanding skip error is recorded as `skipped`;
 * - any other error is recorded as `failed`, and the next entry is tried.
 *
 * @returns The first output (or `null` when every entry came up empty) plus
 *   the attempt records in execution order.
 */
async function runAttachmentEntries(params: {
  capability: MediaUnderstandingCapability;
  cfg: OpenClawConfig;
  ctx: MsgContext;
  attachmentIndex: number;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  cache: MediaAttachmentCache;
  entries: MediaUnderstandingModelConfig[];
  config?: MediaUnderstandingConfig;
}): Promise<{
  output: MediaUnderstandingOutput | null;
  attempts: MediaUnderstandingModelDecision[];
}> {
  const { entries, capability } = params;
  const attempts: MediaUnderstandingModelDecision[] = [];

  for (const entry of entries) {
    const entryType = entry.type ?? (entry.command ? "cli" : "provider");
    // Arguments common to both runners.
    const shared = {
      capability,
      entry,
      cfg: params.cfg,
      ctx: params.ctx,
      attachmentIndex: params.attachmentIndex,
      cache: params.cache,
      config: params.config,
    };

    try {
      const result =
        entryType === "cli"
          ? await runCliEntry(shared)
          : await runProviderEntry({
              ...shared,
              agentDir: params.agentDir,
              providerRegistry: params.providerRegistry,
            });

      if (result) {
        const decision = buildModelDecision({ entry, entryType, outcome: "success" });
        // Prefer what the runner reports actually having used.
        if (result.provider) {
          decision.provider = result.provider;
        }
        if (result.model) {
          decision.model = result.model;
        }
        attempts.push(decision);
        return { output: result, attempts };
      }

      attempts.push(
        buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }),
      );
    } catch (err) {
      if (isMediaUnderstandingSkipError(err)) {
        attempts.push(
          buildModelDecision({
            entry,
            entryType,
            outcome: "skipped",
            reason: `${err.reason}: ${err.message}`,
          }),
        );
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
        }
        continue;
      }

      // Unexpected failure: record it and fall through to the next entry.
      attempts.push(
        buildModelDecision({
          entry,
          entryType,
          outcome: "failed",
          reason: String(err),
        }),
      );
      if (shouldLogVerbose()) {
        logVerbose(`${capability} understanding failed: ${String(err)}`);
      }
    }
  }

  return { output: null, attempts };
}
|
||||
|
||||
/**
 * Runs one media-understanding capability (audio, image or video) over the
 * attachments of a message.
 *
 * Steps, each of which may end the run early:
 * 1. `disabled` — the capability config has `enabled: false`.
 * 2. `no-attachment` — the attachment policy selects nothing.
 * 3. `scope-deny` — the scope policy denies this context.
 * 4. `skipped` — for images, the active model is found in the catalog and
 *    supports vision itself.
 * 5. Model entries come from config, falling back to automatic resolution;
 *    `skipped` when neither yields any.
 * 6. Each selected attachment is run through the entries; the outcome is
 *    `success` when at least one attachment produced output, else `skipped`.
 *
 * @param params.config Overrides `cfg.tools.media[capability]` when given.
 * @returns The collected outputs and the decision record.
 */
export async function runCapability(params: {
  capability: MediaUnderstandingCapability;
  cfg: OpenClawConfig;
  ctx: MsgContext;
  attachments: MediaAttachmentCache;
  media: MediaAttachment[];
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  config?: MediaUnderstandingConfig;
  activeModel?: ActiveMediaModel;
}): Promise<RunCapabilityResult> {
  const { capability, cfg, ctx } = params;
  const config = params.config ?? cfg.tools?.media?.[capability];
  if (config?.enabled === false) {
    return {
      outputs: [],
      decision: { capability, outcome: "disabled", attachments: [] },
    };
  }

  const selected = selectAttachments({
    capability,
    attachments: params.media,
    policy: config?.attachments,
  });
  if (selected.length === 0) {
    return {
      outputs: [],
      decision: { capability, outcome: "no-attachment", attachments: [] },
    };
  }

  if (resolveScopeDecision({ scope: config?.scope, ctx }) === "deny") {
    if (shouldLogVerbose()) {
      logVerbose(`${capability} understanding disabled by scope policy.`);
    }
    return {
      outputs: [],
      decision: {
        capability,
        outcome: "scope-deny",
        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
      },
    };
  }

  // Skip image understanding when the primary model supports vision natively.
  // The image will be injected directly into the model context instead.
  const activeProvider = params.activeModel?.provider?.trim();
  if (capability === "image" && activeProvider) {
    const catalog = await loadModelCatalog({ config: cfg });
    const catalogEntry = findModelInCatalog(
      catalog,
      activeProvider,
      params.activeModel?.model ?? "",
    );
    if (modelSupportsVision(catalogEntry)) {
      if (shouldLogVerbose()) {
        logVerbose("Skipping image understanding: primary model supports vision natively");
      }
      const model = params.activeModel?.model?.trim();
      const reason = "primary model supports vision natively";
      return {
        outputs: [],
        decision: {
          capability,
          outcome: "skipped",
          attachments: selected.map((item) => {
            // The same record serves as the only attempt and as the chosen one.
            const attempt = {
              type: "provider" as const,
              provider: activeProvider,
              model: model || undefined,
              outcome: "skipped" as const,
              reason,
            };
            return {
              attachmentIndex: item.index,
              attempts: [attempt],
              chosen: attempt,
            };
          }),
        },
      };
    }
  }

  // Explicitly configured entries win; auto-resolution is only a fallback.
  const configuredEntries = resolveModelEntries({
    cfg,
    capability,
    config,
    providerRegistry: params.providerRegistry,
  });
  const resolvedEntries =
    configuredEntries.length > 0
      ? configuredEntries
      : await resolveAutoEntries({
          cfg,
          agentDir: params.agentDir,
          providerRegistry: params.providerRegistry,
          capability,
          activeModel: params.activeModel,
        });
  if (resolvedEntries.length === 0) {
    return {
      outputs: [],
      decision: {
        capability,
        outcome: "skipped",
        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
      },
    };
  }

  const outputs: MediaUnderstandingOutput[] = [];
  const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
  // Attachments are processed one after another, in selection order.
  for (const attachment of selected) {
    const { output, attempts } = await runAttachmentEntries({
      capability,
      cfg,
      ctx,
      attachmentIndex: attachment.index,
      agentDir: params.agentDir,
      providerRegistry: params.providerRegistry,
      cache: params.attachments,
      entries: resolvedEntries,
      config,
    });
    if (output) {
      outputs.push(output);
    }
    attachmentDecisions.push({
      attachmentIndex: attachment.index,
      attempts,
      chosen: attempts.find((attempt) => attempt.outcome === "success"),
    });
  }

  const decision: MediaUnderstandingDecision = {
    capability,
    outcome: outputs.length > 0 ? "success" : "skipped",
    attachments: attachmentDecisions,
  };
  if (shouldLogVerbose()) {
    logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
  }
  return { outputs, decision };
}
|
||||
|
|
@@ -1,77 +1,38 @@
|
|||
import { constants as fsConstants } from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
|
||||
import {
|
||||
findModelInCatalog,
|
||||
loadModelCatalog,
|
||||
modelSupportsVision,
|
||||
} from "../agents/model-catalog.js";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type { MediaUnderstandingConfig } from "../config/types.tools.js";
|
||||
import {
|
||||
resolveAgentModelFallbackValues,
|
||||
resolveAgentModelPrimaryValue,
|
||||
} from "../config/model-input.js";
|
||||
import type {
|
||||
MediaUnderstandingConfig,
|
||||
MediaUnderstandingModelConfig,
|
||||
} from "../config/types.tools.js";
|
||||
clearMediaUnderstandingBinaryCacheForTests as clearExtensionHostMediaUnderstandingBinaryCacheForTests,
|
||||
resolveAutoImageModel as resolveExtensionHostAutoImageModel,
|
||||
type ActiveMediaModel,
|
||||
} from "../extension-host/media-runtime-auto.js";
|
||||
import {
|
||||
runCapability as runExtensionHostMediaCapability,
|
||||
type RunCapabilityResult,
|
||||
} from "../extension-host/media-runtime-orchestration.js";
|
||||
import {
|
||||
buildExtensionHostMediaUnderstandingRegistry,
|
||||
getExtensionHostMediaUnderstandingProvider,
|
||||
normalizeExtensionHostMediaProviderId,
|
||||
type ExtensionHostMediaUnderstandingProviderRegistry,
|
||||
} from "../extension-host/media-runtime-registry.js";
|
||||
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||
import {
|
||||
mergeInboundPathRoots,
|
||||
resolveIMessageAttachmentRoots,
|
||||
} from "../media/inbound-path-policy.js";
|
||||
import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
|
||||
import { runExec } from "../process/exec.js";
|
||||
import {
|
||||
MediaAttachmentCache,
|
||||
type MediaAttachmentCacheOptions,
|
||||
normalizeAttachments,
|
||||
selectAttachments,
|
||||
} from "./attachments.js";
|
||||
import {
|
||||
AUTO_AUDIO_KEY_PROVIDERS,
|
||||
AUTO_IMAGE_KEY_PROVIDERS,
|
||||
AUTO_VIDEO_KEY_PROVIDERS,
|
||||
DEFAULT_IMAGE_MODELS,
|
||||
} from "./defaults.js";
|
||||
import { isMediaUnderstandingSkipError } from "./errors.js";
|
||||
import { fileExists } from "./fs.js";
|
||||
import { extractGeminiResponse } from "./output-extract.js";
|
||||
import { resolveModelEntries, resolveScopeDecision } from "./resolve.js";
|
||||
import {
|
||||
buildModelDecision,
|
||||
formatDecisionSummary,
|
||||
runCliEntry,
|
||||
runProviderEntry,
|
||||
} from "./runner.entries.js";
|
||||
import type {
|
||||
MediaAttachment,
|
||||
MediaUnderstandingCapability,
|
||||
MediaUnderstandingDecision,
|
||||
MediaUnderstandingModelDecision,
|
||||
MediaUnderstandingOutput,
|
||||
MediaUnderstandingProvider,
|
||||
} from "./types.js";
|
||||
|
||||
/**
 * Describes the model the caller is currently using.
 * Only the provider is mandatory; the model name may be left out.
 */
export type ActiveMediaModel = {
  /** Provider identifier. */
  provider: string;
  /** Model name for that provider, when known. */
  model?: string;
};
|
||||
type ProviderRegistry = ExtensionHostMediaUnderstandingProviderRegistry;
|
||||
|
||||
type ProviderRegistry = Map<string, MediaUnderstandingProvider>;
|
||||
|
||||
/**
 * Result of running one media-understanding capability:
 * the outputs that were produced and the decision record describing the run.
 */
export type RunCapabilityResult = {
  /** Outputs collected for the processed attachments. */
  outputs: Array<MediaUnderstandingOutput>;
  /** Decision record for the capability run. */
  decision: MediaUnderstandingDecision;
};
|
||||
export type { ActiveMediaModel, RunCapabilityResult };
|
||||
|
||||
export function buildProviderRegistry(
|
||||
overrides?: Record<string, MediaUnderstandingProvider>,
|
||||
|
|
@ -103,385 +64,8 @@ export function createMediaAttachmentCache(
|
|||
return new MediaAttachmentCache(attachments, options);
|
||||
}
|
||||
|
||||
// Module-level memoisation of binary lookups; values are promises so that
// concurrent callers share one in-flight lookup.
const binaryCache = new Map<string, Promise<string | null>>();

// Module-level memoisation of the Gemini CLI probe result.
const geminiProbeCache = new Map<string, Promise<boolean>>();

/**
 * Empties both module-level caches so that a test can start from a clean slate.
 */
export function clearMediaUnderstandingBinaryCacheForTests(): void {
  for (const cache of [binaryCache, geminiProbeCache]) {
    cache.clear();
  }
}
|
||||
|
||||
/**
 * Expands a leading `~` to the current user's home directory.
 *
 * - `~` alone becomes the home directory.
 * - `~/rest` (and, where the platform separator is not `/`, `~<sep>rest`)
 *   becomes `<home>/rest`.
 * - Anything else, including `~user/...` forms, is returned unchanged.
 *
 * @param value Path-like string to expand.
 * @returns The expanded path, or `value` itself when no expansion applies.
 */
function expandHomeDir(value: string): string {
  if (value === "~") {
    return os.homedir();
  }
  // Accept the native separator as well, so `~\tool` expands on Windows.
  // On POSIX `path.sep` is "/", so behavior there is unchanged.
  const homeRelative =
    value.startsWith("~/") || (path.sep !== "/" && value.startsWith(`~${path.sep}`));
  if (!homeRelative) {
    return value;
  }
  return path.join(os.homedir(), value.slice(2));
}
|
||||
|
||||
/**
 * Reports whether `value` contains a forward slash or a backslash anywhere.
 */
function hasPathSeparator(value: string): boolean {
  return /[\\/]/.test(value);
}
|
||||
|
||||
/**
 * Lists the file names to try when looking for an executable called `name`.
 *
 * Outside Windows, or when `name` already carries an extension, the list is
 * just `[name]`. On Windows without an extension, `name` is followed by one
 * entry per distinct `PATHEXT` extension (falling back to
 * `.EXE;.CMD;.BAT;.COM` when `PATHEXT` is unset).
 */
function candidateBinaryNames(name: string): string[] {
  const probeExtensions = process.platform === "win32" && path.extname(name) === "";
  if (!probeExtensions) {
    return [name];
  }
  // A Set keeps first-seen order while dropping duplicate extensions.
  const extensions = new Set<string>();
  for (const raw of (process.env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM").split(";")) {
    const trimmed = raw.trim();
    if (trimmed) {
      extensions.add(trimmed.startsWith(".") ? trimmed : `.${trimmed}`);
    }
  }
  return [name, ...Array.from(extensions, (ext) => `${name}${ext}`)];
}
|
||||
|
||||
/**
 * Checks whether `filePath` points at a regular file that can be executed.
 *
 * On Windows any regular file qualifies; elsewhere the file must also pass an
 * `X_OK` access check. Any filesystem error yields `false`.
 */
async function isExecutable(filePath: string): Promise<boolean> {
  try {
    const info = await fs.stat(filePath);
    if (!info.isFile()) {
      return false;
    }
    if (process.platform !== "win32") {
      // Throws when the execute permission is missing; handled below.
      await fs.access(filePath, fsConstants.X_OK);
    }
    return true;
  } catch {
    return false;
  }
}
|
||||
|
||||
/**
 * Locates an executable by name or path, memoising the lookup per `name`.
 *
 * Resolution order:
 * 1. If the trimmed, home-expanded name contains a path separator, each
 *    candidate file name for it is tested directly.
 * 2. Otherwise (or if step 1 finds nothing) every `PATH` entry is scanned,
 *    after trimming, stripping one pair of surrounding double quotes and
 *    expanding a leading `~`.
 *
 * @returns The first executable path found, or `null`.
 */
async function findBinary(name: string): Promise<string | null> {
  const memoised = binaryCache.get(name);
  if (memoised) {
    return memoised;
  }

  const locate = async (): Promise<string | null> => {
    const trimmed = name.trim();

    // Explicit path: test it as given, without consulting PATH first.
    const expanded = expandHomeDir(trimmed);
    if (expanded && hasPathSeparator(expanded)) {
      for (const candidate of candidateBinaryNames(expanded)) {
        if (await isExecutable(candidate)) {
          return candidate;
        }
      }
    }

    if (!trimmed) {
      return null;
    }

    const fileNames = candidateBinaryNames(trimmed);
    for (const rawDir of (process.env.PATH ?? "").split(path.delimiter)) {
      const dir = expandHomeDir(rawDir.trim().replace(/^"(.*)"$/, "$1"));
      if (!dir) {
        continue;
      }
      for (const fileName of fileNames) {
        const fullPath = path.join(dir, fileName);
        if (await isExecutable(fullPath)) {
          return fullPath;
        }
      }
    }

    return null;
  };

  // Cache the promise itself so concurrent callers share one lookup.
  const pending = locate();
  binaryCache.set(name, pending);
  return pending;
}
|
||||
|
||||
/**
 * Reports whether `findBinary` can locate an executable for `name`.
 */
async function hasBinary(name: string): Promise<boolean> {
  const location = await findBinary(name);
  return Boolean(location);
}
|
||||
|
||||
/**
 * Checks, once per cache lifetime, whether the `gemini` CLI is usable.
 *
 * The probe requires the binary to be found and then runs
 * `gemini --output-format json ok` with an 8000 ms timeout. It succeeds when
 * a response can be extracted from stdout or, if extraction yields
 * null/undefined, when stdout contains "ok" (case-insensitive).
 * Any execution error counts as "not usable".
 */
async function probeGeminiCli(): Promise<boolean> {
  const cacheKey = "gemini";
  const memoised = geminiProbeCache.get(cacheKey);
  if (memoised) {
    return memoised;
  }

  const probe = async (): Promise<boolean> => {
    const installed = await hasBinary("gemini");
    if (!installed) {
      return false;
    }
    try {
      const { stdout } = await runExec("gemini", ["--output-format", "json", "ok"], {
        timeoutMs: 8000,
      });
      const parsed = extractGeminiResponse(stdout);
      return Boolean(parsed ?? stdout.toLowerCase().includes("ok"));
    } catch {
      return false;
    }
  };

  const pending = probe();
  geminiProbeCache.set(cacheKey, pending);
  return pending;
}
|
||||
|
||||
/**
 * Builds a CLI entry for `whisper-cli` when both the binary and a model file
 * are available.
 *
 * The model path comes from `WHISPER_CPP_MODEL` when that file exists;
 * otherwise the built-in default path is used if it exists.
 *
 * @returns The CLI entry, or `null` when the binary or every model candidate
 *   is missing.
 */
async function resolveLocalWhisperCppEntry(): Promise<MediaUnderstandingModelConfig | null> {
  if (!(await hasBinary("whisper-cli"))) {
    return null;
  }
  const envModel = process.env.WHISPER_CPP_MODEL?.trim();
  const defaultModel = "/opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin";

  // Check each candidate exactly once; the env-provided path takes priority.
  let modelPath: string | null = null;
  if (envModel && (await fileExists(envModel))) {
    modelPath = envModel;
  } else if (await fileExists(defaultModel)) {
    modelPath = defaultModel;
  }
  if (!modelPath) {
    return null;
  }

  return {
    type: "cli",
    command: "whisper-cli",
    args: ["-m", modelPath, "-otxt", "-of", "{{OutputBase}}", "-np", "-nt", "{{MediaPath}}"],
  };
}
|
||||
|
||||
/**
 * Builds a CLI entry for the `whisper` command when that binary can be found.
 *
 * @returns The CLI entry, or `null` when `whisper` is not available.
 */
async function resolveLocalWhisperEntry(): Promise<MediaUnderstandingModelConfig | null> {
  const available = await hasBinary("whisper");
  if (!available) {
    return null;
  }
  const args = [
    "--model",
    "turbo",
    "--output_format",
    "txt",
    "--output_dir",
    "{{OutputDir}}",
    "--verbose",
    "False",
    "{{MediaPath}}",
  ];
  return { type: "cli", command: "whisper", args };
}
|
||||
|
||||
/**
 * Builds a CLI entry for `sherpa-onnx-offline`.
 *
 * Requires the binary, a non-empty `SHERPA_ONNX_MODEL_DIR`, and the four
 * model files (`tokens.txt`, `encoder.onnx`, `decoder.onnx`, `joiner.onnx`)
 * inside that directory.
 *
 * @returns The CLI entry, or `null` when any prerequisite is missing.
 */
async function resolveSherpaOnnxEntry(): Promise<MediaUnderstandingModelConfig | null> {
  if (!(await hasBinary("sherpa-onnx-offline"))) {
    return null;
  }
  const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim();
  if (!modelDir) {
    return null;
  }

  const files = {
    tokens: path.join(modelDir, "tokens.txt"),
    encoder: path.join(modelDir, "encoder.onnx"),
    decoder: path.join(modelDir, "decoder.onnx"),
    joiner: path.join(modelDir, "joiner.onnx"),
  };

  // Checked one at a time, stopping at the first missing file.
  for (const required of [files.tokens, files.encoder, files.decoder, files.joiner]) {
    if (!(await fileExists(required))) {
      return null;
    }
  }

  return {
    type: "cli",
    command: "sherpa-onnx-offline",
    args: [
      `--tokens=${files.tokens}`,
      `--encoder=${files.encoder}`,
      `--decoder=${files.decoder}`,
      `--joiner=${files.joiner}`,
      "{{MediaPath}}",
    ],
  };
}
|
||||
|
||||
/**
 * Picks the first available local audio transcription CLI.
 *
 * Candidates are tried in order: sherpa-onnx, whisper.cpp, then whisper.
 *
 * @returns The first entry that resolves, or `null` when none does.
 */
async function resolveLocalAudioEntry(): Promise<MediaUnderstandingModelConfig | null> {
  const resolvers = [resolveSherpaOnnxEntry, resolveLocalWhisperCppEntry, resolveLocalWhisperEntry];
  for (const resolve of resolvers) {
    const entry = await resolve();
    if (entry) {
      return entry;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Builds a CLI entry for the `gemini` command when the probe succeeds.
 *
 * @param _capability Unused; the same entry is returned for every capability.
 * @returns The CLI entry, or `null` when the Gemini CLI probe fails.
 */
async function resolveGeminiCliEntry(
  _capability: MediaUnderstandingCapability,
): Promise<MediaUnderstandingModelConfig | null> {
  const usable = await probeGeminiCli();
  if (!usable) {
    return null;
  }
  const args = [
    "--output-format",
    "json",
    "--allowed-tools",
    "read_many_files",
    "--include-directories",
    "{{MediaDir}}",
    "{{Prompt}}",
    "Use read_many_files to read {{MediaPath}} and respond with only the text output.",
  ];
  return { type: "cli", command: "gemini", args };
}
|
||||
|
||||
/**
 * Finds a provider entry that both supports `capability` and has a
 * resolvable API key.
 *
 * The active model's provider (if any) is tried first with the active model
 * name. After that the capability's auto-provider list is walked in order:
 * image providers are paired with their default image model, video and audio
 * providers with no explicit model.
 *
 * @returns The first usable provider entry, or `null` when none qualifies.
 */
async function resolveKeyEntry(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  capability: MediaUnderstandingCapability;
  activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig | null> {
  const { cfg, agentDir, providerRegistry, capability, activeModel } = params;

  // Returns an entry only if the provider exists, implements the capability
  // and an API key can be resolved for it.
  const usableEntry = async (
    providerId: string,
    model?: string,
  ): Promise<MediaUnderstandingModelConfig | null> => {
    const provider = getExtensionHostMediaUnderstandingProvider(providerId, providerRegistry);
    if (!provider) {
      return null;
    }
    const unsupported =
      (capability === "audio" && !provider.transcribeAudio) ||
      (capability === "image" && !provider.describeImage) ||
      (capability === "video" && !provider.describeVideo);
    if (unsupported) {
      return null;
    }
    try {
      await resolveApiKeyForProvider({ provider: providerId, cfg, agentDir });
    } catch {
      // No key available for this provider; treat it as unusable.
      return null;
    }
    return { type: "provider" as const, provider: providerId, model };
  };

  const activeProvider = activeModel?.provider?.trim();
  if (activeProvider) {
    const activeEntry = await usableEntry(activeProvider, activeModel?.model);
    if (activeEntry) {
      return activeEntry;
    }
  }

  const isImage = capability === "image";
  // Anything that is neither image nor video uses the audio provider list.
  const autoProviders: readonly string[] = isImage
    ? AUTO_IMAGE_KEY_PROVIDERS
    : capability === "video"
      ? AUTO_VIDEO_KEY_PROVIDERS
      : AUTO_AUDIO_KEY_PROVIDERS;

  for (const providerId of autoProviders) {
    const model = isImage ? DEFAULT_IMAGE_MODELS[providerId] : undefined;
    const entry = await usableEntry(providerId, model);
    if (entry) {
      return entry;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Converts the agent-default image model references (primary first, then
 * fallbacks) into provider entries.
 *
 * Each reference is trimmed and must look like `provider/model` with a
 * non-empty part on both sides of the first slash; anything else is skipped.
 *
 * @returns Provider entries in reference order; empty when none are valid.
 */
function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstandingModelConfig[] {
  const candidates = [
    resolveAgentModelPrimaryValue(cfg.agents?.defaults?.imageModel),
    ...resolveAgentModelFallbackValues(cfg.agents?.defaults?.imageModel),
  ];

  const entries: MediaUnderstandingModelConfig[] = [];
  for (const candidate of candidates) {
    const ref = candidate?.trim();
    if (!ref) {
      continue;
    }
    // Split on the first slash only; the model part may itself contain slashes.
    const slashIdx = ref.indexOf("/");
    const hasProvider = slashIdx > 0;
    const hasModel = slashIdx < ref.length - 1;
    if (!hasProvider || !hasModel) {
      continue;
    }
    entries.push({
      type: "provider",
      provider: ref.slice(0, slashIdx),
      model: ref.slice(slashIdx + 1),
    });
  }
  return entries;
}
|
||||
|
||||
/**
 * Chooses model entries automatically when none are configured.
 *
 * Sources are consulted in priority order and the first hit wins:
 * 1. the active model, if it is usable for the capability;
 * 2. for audio, a local transcription CLI;
 * 3. for image, the agent-default image model references;
 * 4. the Gemini CLI;
 * 5. a provider with a resolvable API key.
 *
 * @returns The chosen entries, or an empty array when nothing is available.
 */
async function resolveAutoEntries(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  capability: MediaUnderstandingCapability;
  activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig[]> {
  const activeEntry = await resolveActiveModelEntry(params);
  if (activeEntry) {
    return [activeEntry];
  }

  if (params.capability === "audio") {
    const localAudio = await resolveLocalAudioEntry();
    if (localAudio) {
      return [localAudio];
    }
  }

  if (params.capability === "image") {
    const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg);
    if (imageModelEntries.length > 0) {
      return imageModelEntries;
    }
  }

  const gemini = await resolveGeminiCliEntry(params.capability);
  if (gemini) {
    return [gemini];
  }

  const keys = await resolveKeyEntry(params);
  // The statement that used to follow the final return was unreachable and
  // has been removed.
  return keys ? [keys] : [];
}
|
||||
|
||||
export async function resolveAutoImageModel(params: {
|
||||
|
|
@ -489,171 +73,10 @@ export async function resolveAutoImageModel(params: {
|
|||
agentDir?: string;
|
||||
activeModel?: ActiveMediaModel;
|
||||
}): Promise<ActiveMediaModel | null> {
|
||||
const providerRegistry = buildProviderRegistry();
|
||||
const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => {
|
||||
if (!entry || entry.type === "cli") {
|
||||
return null;
|
||||
}
|
||||
const provider = entry.provider;
|
||||
if (!provider) {
|
||||
return null;
|
||||
}
|
||||
const model = entry.model ?? DEFAULT_IMAGE_MODELS[provider];
|
||||
if (!model) {
|
||||
return null;
|
||||
}
|
||||
return { provider, model };
|
||||
};
|
||||
const activeEntry = await resolveActiveModelEntry({
|
||||
cfg: params.cfg,
|
||||
agentDir: params.agentDir,
|
||||
providerRegistry,
|
||||
capability: "image",
|
||||
activeModel: params.activeModel,
|
||||
return await resolveExtensionHostAutoImageModel({
|
||||
...params,
|
||||
providerRegistry: buildProviderRegistry(),
|
||||
});
|
||||
const resolvedActive = toActive(activeEntry);
|
||||
if (resolvedActive) {
|
||||
return resolvedActive;
|
||||
}
|
||||
const keyEntry = await resolveKeyEntry({
|
||||
cfg: params.cfg,
|
||||
agentDir: params.agentDir,
|
||||
providerRegistry,
|
||||
capability: "image",
|
||||
activeModel: params.activeModel,
|
||||
});
|
||||
return toActive(keyEntry);
|
||||
}
|
||||
|
||||
/**
 * Turns the active model into a provider entry when it can serve `capability`.
 *
 * The active provider id is trimmed and normalized, must be present in the
 * registry, must implement the handler for the capability, and must have a
 * resolvable API key.
 *
 * @returns A provider entry carrying the active model name, or `null` when
 *   any of those checks fails.
 */
async function resolveActiveModelEntry(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  capability: MediaUnderstandingCapability;
  activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig | null> {
  const { cfg, agentDir, providerRegistry, capability, activeModel } = params;

  const requested = activeModel?.provider?.trim();
  if (!requested) {
    return null;
  }
  const providerId = normalizeExtensionHostMediaProviderId(requested);
  if (!providerId) {
    return null;
  }
  const provider = getExtensionHostMediaUnderstandingProvider(providerId, providerRegistry);
  if (!provider) {
    return null;
  }

  // The provider must implement the handler matching the capability.
  switch (capability) {
    case "audio":
      if (!provider.transcribeAudio) {
        return null;
      }
      break;
    case "image":
      if (!provider.describeImage) {
        return null;
      }
      break;
    case "video":
      if (!provider.describeVideo) {
        return null;
      }
      break;
  }

  try {
    await resolveApiKeyForProvider({ provider: providerId, cfg, agentDir });
  } catch {
    // No credentials for the active provider.
    return null;
  }

  return {
    type: "provider",
    provider: providerId,
    model: activeModel?.model,
  };
}
|
||||
|
||||
/**
 * Runs the candidate entries against one attachment until one yields output.
 *
 * Entries are tried in order. Each attempt is recorded as a model decision:
 * - `success`: the runner returned a result; iteration stops.
 * - `skipped`: the runner returned nothing ("empty output") or threw a
 *   media-understanding skip error.
 * - `failed`: the runner threw any other error.
 *
 * @returns The first non-empty output (or `null`) together with every
 *   attempt made.
 */
async function runAttachmentEntries(params: {
  capability: MediaUnderstandingCapability;
  cfg: OpenClawConfig;
  ctx: MsgContext;
  attachmentIndex: number;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  cache: MediaAttachmentCache;
  entries: MediaUnderstandingModelConfig[];
  config?: MediaUnderstandingConfig;
}): Promise<{
  output: MediaUnderstandingOutput | null;
  attempts: MediaUnderstandingModelDecision[];
}> {
  const { entries, capability } = params;
  const attempts: MediaUnderstandingModelDecision[] = [];

  for (const entry of entries) {
    // An entry without an explicit type is a CLI entry when it has a command.
    const entryType = entry.type ?? (entry.command ? "cli" : "provider");
    const shared = {
      capability,
      entry,
      cfg: params.cfg,
      ctx: params.ctx,
      attachmentIndex: params.attachmentIndex,
      cache: params.cache,
      config: params.config,
    };

    try {
      const result =
        entryType === "cli"
          ? await runCliEntry(shared)
          : await runProviderEntry({
              ...shared,
              agentDir: params.agentDir,
              providerRegistry: params.providerRegistry,
            });

      if (!result) {
        attempts.push(
          buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }),
        );
        continue;
      }

      const decision = buildModelDecision({ entry, entryType, outcome: "success" });
      // Prefer the provider/model the runner actually reported.
      if (result.provider) {
        decision.provider = result.provider;
      }
      if (result.model) {
        decision.model = result.model;
      }
      attempts.push(decision);
      return { output: result, attempts };
    } catch (err) {
      if (isMediaUnderstandingSkipError(err)) {
        attempts.push(
          buildModelDecision({
            entry,
            entryType,
            outcome: "skipped",
            reason: `${err.reason}: ${err.message}`,
          }),
        );
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
        }
        continue;
      }

      attempts.push(
        buildModelDecision({
          entry,
          entryType,
          outcome: "failed",
          reason: String(err),
        }),
      );
      if (shouldLogVerbose()) {
        logVerbose(`${capability} understanding failed: ${String(err)}`);
      }
    }
  }

  return { output: null, attempts };
}
|
||||
|
||||
export async function runCapability(params: {
|
||||
|
|
@ -667,139 +90,5 @@ export async function runCapability(params: {
|
|||
config?: MediaUnderstandingConfig;
|
||||
activeModel?: ActiveMediaModel;
|
||||
}): Promise<RunCapabilityResult> {
|
||||
const { capability, cfg, ctx } = params;
|
||||
const config = params.config ?? cfg.tools?.media?.[capability];
|
||||
if (config?.enabled === false) {
|
||||
return {
|
||||
outputs: [],
|
||||
decision: { capability, outcome: "disabled", attachments: [] },
|
||||
};
|
||||
}
|
||||
|
||||
const attachmentPolicy = config?.attachments;
|
||||
const selected = selectAttachments({
|
||||
capability,
|
||||
attachments: params.media,
|
||||
policy: attachmentPolicy,
|
||||
});
|
||||
if (selected.length === 0) {
|
||||
return {
|
||||
outputs: [],
|
||||
decision: { capability, outcome: "no-attachment", attachments: [] },
|
||||
};
|
||||
}
|
||||
|
||||
const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
|
||||
if (scopeDecision === "deny") {
|
||||
if (shouldLogVerbose()) {
|
||||
logVerbose(`${capability} understanding disabled by scope policy.`);
|
||||
}
|
||||
return {
|
||||
outputs: [],
|
||||
decision: {
|
||||
capability,
|
||||
outcome: "scope-deny",
|
||||
attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
// Skip image understanding when the primary model supports vision natively.
|
||||
// The image will be injected directly into the model context instead.
|
||||
const activeProvider = params.activeModel?.provider?.trim();
|
||||
if (capability === "image" && activeProvider) {
|
||||
const catalog = await loadModelCatalog({ config: cfg });
|
||||
const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
|
||||
if (modelSupportsVision(entry)) {
|
||||
if (shouldLogVerbose()) {
|
||||
logVerbose("Skipping image understanding: primary model supports vision natively");
|
||||
}
|
||||
const model = params.activeModel?.model?.trim();
|
||||
const reason = "primary model supports vision natively";
|
||||
return {
|
||||
outputs: [],
|
||||
decision: {
|
||||
capability,
|
||||
outcome: "skipped",
|
||||
attachments: selected.map((item) => {
|
||||
const attempt = {
|
||||
type: "provider" as const,
|
||||
provider: activeProvider,
|
||||
model: model || undefined,
|
||||
outcome: "skipped" as const,
|
||||
reason,
|
||||
};
|
||||
return {
|
||||
attachmentIndex: item.index,
|
||||
attempts: [attempt],
|
||||
chosen: attempt,
|
||||
};
|
||||
}),
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const entries = resolveModelEntries({
|
||||
cfg,
|
||||
capability,
|
||||
config,
|
||||
providerRegistry: params.providerRegistry,
|
||||
});
|
||||
let resolvedEntries = entries;
|
||||
if (resolvedEntries.length === 0) {
|
||||
resolvedEntries = await resolveAutoEntries({
|
||||
cfg,
|
||||
agentDir: params.agentDir,
|
||||
providerRegistry: params.providerRegistry,
|
||||
capability,
|
||||
activeModel: params.activeModel,
|
||||
});
|
||||
}
|
||||
if (resolvedEntries.length === 0) {
|
||||
return {
|
||||
outputs: [],
|
||||
decision: {
|
||||
capability,
|
||||
outcome: "skipped",
|
||||
attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
const outputs: MediaUnderstandingOutput[] = [];
|
||||
const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
|
||||
for (const attachment of selected) {
|
||||
const { output, attempts } = await runAttachmentEntries({
|
||||
capability,
|
||||
cfg,
|
||||
ctx,
|
||||
attachmentIndex: attachment.index,
|
||||
agentDir: params.agentDir,
|
||||
providerRegistry: params.providerRegistry,
|
||||
cache: params.attachments,
|
||||
entries: resolvedEntries,
|
||||
config,
|
||||
});
|
||||
if (output) {
|
||||
outputs.push(output);
|
||||
}
|
||||
attachmentDecisions.push({
|
||||
attachmentIndex: attachment.index,
|
||||
attempts,
|
||||
chosen: attempts.find((attempt) => attempt.outcome === "success"),
|
||||
});
|
||||
}
|
||||
const decision: MediaUnderstandingDecision = {
|
||||
capability,
|
||||
outcome: outputs.length > 0 ? "success" : "skipped",
|
||||
attachments: attachmentDecisions,
|
||||
};
|
||||
if (shouldLogVerbose()) {
|
||||
logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
|
||||
}
|
||||
return {
|
||||
outputs,
|
||||
decision,
|
||||
};
|
||||
return await runExtensionHostMediaCapability(params);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue