From 37ee04e9b0fdd95b0bb28205086523e890cee5e4 Mon Sep 17 00:00:00 2001 From: Gustavo Madeira Santana Date: Sun, 15 Mar 2026 19:52:50 +0000 Subject: [PATCH] Media: extract runtime auto and orchestration --- src/agents/model-auth.ts | 2 +- src/extension-host/media-runtime-auto.test.ts | 52 ++ src/extension-host/media-runtime-auto.ts | 499 ++++++++++++ .../media-runtime-orchestration.test.ts | 58 ++ .../media-runtime-orchestration.ts | 272 +++++++ src/media-understanding/runner.ts | 745 +----------------- 6 files changed, 899 insertions(+), 729 deletions(-) create mode 100644 src/extension-host/media-runtime-auto.test.ts create mode 100644 src/extension-host/media-runtime-auto.ts create mode 100644 src/extension-host/media-runtime-orchestration.test.ts create mode 100644 src/extension-host/media-runtime-orchestration.ts diff --git a/src/agents/model-auth.ts b/src/agents/model-auth.ts index fb3abd1571e..19f591c2fe1 100644 --- a/src/agents/model-auth.ts +++ b/src/agents/model-auth.ts @@ -25,7 +25,7 @@ import { isNonSecretApiKeyMarker, OLLAMA_LOCAL_AUTH_MARKER, } from "./model-auth-markers.js"; -import { normalizeProviderId } from "./model-selection.js"; +import { normalizeProviderId } from "./provider-id.js"; export { ensureAuthProfileStore, resolveAuthProfileOrder } from "./auth-profiles.js"; diff --git a/src/extension-host/media-runtime-auto.test.ts b/src/extension-host/media-runtime-auto.test.ts new file mode 100644 index 00000000000..e33897e8431 --- /dev/null +++ b/src/extension-host/media-runtime-auto.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, it } from "vitest"; +import type { OpenClawConfig } from "../config/config.js"; +import { DEFAULT_IMAGE_MODELS } from "../media-understanding/defaults.js"; +import { resolveAutoImageModel } from "./media-runtime-auto.js"; +import { buildExtensionHostMediaUnderstandingRegistry } from "./media-runtime-registry.js"; + +function createImageCfg(): OpenClawConfig { + return { + models: { + providers: { + openai: { + apiKey: "test-key", + models: [], + }, + }, + }, + } as unknown as OpenClawConfig; +} + +describe("media runtime auto image model", () => { + it("keeps a valid active image model", async () => { + const result = await resolveAutoImageModel({ + cfg: createImageCfg(), + providerRegistry: buildExtensionHostMediaUnderstandingRegistry(), + activeModel: { + provider: "openai", + model: "gpt-4.1-mini", + }, + }); + + expect(result).toEqual({ + provider: "openai", + model: "gpt-4.1-mini", + }); + }); + + it("falls back to the default keyed image model when the active model cannot be used", async () => { + const result = await resolveAutoImageModel({ + cfg: createImageCfg(), + providerRegistry: buildExtensionHostMediaUnderstandingRegistry(), + activeModel: { + provider: "missing-provider", + model: "ignored", + }, + }); + + expect(result).toEqual({ + provider: "openai", + model: DEFAULT_IMAGE_MODELS.openai, + }); + }); +}); diff --git a/src/extension-host/media-runtime-auto.ts b/src/extension-host/media-runtime-auto.ts new file mode 100644 index 00000000000..508666f14eb --- /dev/null +++ b/src/extension-host/media-runtime-auto.ts @@ -0,0 +1,499 @@ +import { constants as fsConstants } from "node:fs"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { resolveApiKeyForProvider } from "../agents/model-auth.js"; +import type { OpenClawConfig } from "../config/config.js"; +import { + resolveAgentModelFallbackValues, + resolveAgentModelPrimaryValue, +} from "../config/model-input.js"; +import type { MediaUnderstandingModelConfig } from "../config/types.tools.js"; +import { + getExtensionHostMediaUnderstandingProvider, + normalizeExtensionHostMediaProviderId, + type ExtensionHostMediaUnderstandingProviderRegistry, +} from "../extension-host/media-runtime-registry.js"; +import { + AUTO_AUDIO_KEY_PROVIDERS, + AUTO_IMAGE_KEY_PROVIDERS, + AUTO_VIDEO_KEY_PROVIDERS, + DEFAULT_IMAGE_MODELS, +} from "../media-understanding/defaults.js"; +import { fileExists } from "../media-understanding/fs.js"; +import { extractGeminiResponse } from "../media-understanding/output-extract.js"; +import type { MediaUnderstandingCapability } from "../media-understanding/types.js"; +import { runExec } from "../process/exec.js"; + +export type ActiveMediaModel = { + provider: string; + model?: string; +}; + +type ProviderRegistry = ExtensionHostMediaUnderstandingProviderRegistry; + +const binaryCache = new Map>(); +const geminiProbeCache = new Map>(); + +export function clearMediaUnderstandingBinaryCacheForTests(): void { + binaryCache.clear(); + geminiProbeCache.clear(); +} + +function expandHomeDir(value: string): string { + if (!value.startsWith("~")) { + return value; + } + const home = os.homedir(); + if (value === "~") { + return home; + } + if (value.startsWith("~/")) { + return path.join(home, value.slice(2)); + } + return value; +} + +function hasPathSeparator(value: string): boolean { + return value.includes("/") || value.includes("\\"); +} + +function candidateBinaryNames(name: string): string[] { + if (process.platform !== "win32") { + return [name]; + } + const ext = path.extname(name); + if (ext) { + return [name]; + } + const pathext = (process.env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM") + .split(";") + .map((item) => item.trim()) + .filter(Boolean) + .map((item) => (item.startsWith(".") ? item : `.${item}`)); + const unique = Array.from(new Set(pathext)); + return [name, ...unique.map((item) => `${name}${item}`)]; +} + +async function isExecutable(filePath: string): Promise { + try { + const stat = await fs.stat(filePath); + if (!stat.isFile()) { + return false; + } + if (process.platform === "win32") { + return true; + } + await fs.access(filePath, fsConstants.X_OK); + return true; + } catch { + return false; + } +} + +async function findBinary(name: string): Promise { + const cached = binaryCache.get(name); + if (cached) { + return cached; + } + const resolved = (async () => { + const direct = expandHomeDir(name.trim()); + if (direct && hasPathSeparator(direct)) { + for (const candidate of candidateBinaryNames(direct)) { + if (await isExecutable(candidate)) { + return candidate; + } + } + } + + const searchName = name.trim(); + if (!searchName) { + return null; + } + const pathEntries = (process.env.PATH ?? "").split(path.delimiter); + const candidates = candidateBinaryNames(searchName); + for (const entryRaw of pathEntries) { + const entry = expandHomeDir(entryRaw.trim().replace(/^"(.*)"$/, "$1")); + if (!entry) { + continue; + } + for (const candidate of candidates) { + const fullPath = path.join(entry, candidate); + if (await isExecutable(fullPath)) { + return fullPath; + } + } + } + + return null; + })(); + binaryCache.set(name, resolved); + return resolved; +} + +async function hasBinary(name: string): Promise { + return Boolean(await findBinary(name)); +} + +async function probeGeminiCli(): Promise { + const cached = geminiProbeCache.get("gemini"); + if (cached) { + return cached; + } + const resolved = (async () => { + if (!(await hasBinary("gemini"))) { + return false; + } + try { + const { stdout } = await runExec("gemini", ["--output-format", "json", "ok"], { + timeoutMs: 8000, + }); + return Boolean(extractGeminiResponse(stdout) ?? stdout.toLowerCase().includes("ok")); + } catch { + return false; + } + })(); + geminiProbeCache.set("gemini", resolved); + return resolved; +} + +async function resolveLocalWhisperCppEntry(): Promise { + if (!(await hasBinary("whisper-cli"))) { + return null; + } + const envModel = process.env.WHISPER_CPP_MODEL?.trim(); + const defaultModel = "/opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin"; + const modelPath = envModel && (await fileExists(envModel)) ? envModel : defaultModel; + if (!(await fileExists(modelPath))) { + return null; + } + return { + type: "cli", + command: "whisper-cli", + args: ["-m", modelPath, "-otxt", "-of", "{{OutputBase}}", "-np", "-nt", "{{MediaPath}}"], + }; +} + +async function resolveLocalWhisperEntry(): Promise { + if (!(await hasBinary("whisper"))) { + return null; + } + return { + type: "cli", + command: "whisper", + args: [ + "--model", + "turbo", + "--output_format", + "txt", + "--output_dir", + "{{OutputDir}}", + "--verbose", + "False", + "{{MediaPath}}", + ], + }; +} + +async function resolveSherpaOnnxEntry(): Promise { + if (!(await hasBinary("sherpa-onnx-offline"))) { + return null; + } + const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim(); + if (!modelDir) { + return null; + } + const tokens = path.join(modelDir, "tokens.txt"); + const encoder = path.join(modelDir, "encoder.onnx"); + const decoder = path.join(modelDir, "decoder.onnx"); + const joiner = path.join(modelDir, "joiner.onnx"); + if (!(await fileExists(tokens))) { + return null; + } + if (!(await fileExists(encoder))) { + return null; + } + if (!(await fileExists(decoder))) { + return null; + } + if (!(await fileExists(joiner))) { + return null; + } + return { + type: "cli", + command: "sherpa-onnx-offline", + args: [ + `--tokens=${tokens}`, + `--encoder=${encoder}`, + `--decoder=${decoder}`, + `--joiner=${joiner}`, + "{{MediaPath}}", + ], + }; +} + +async function resolveLocalAudioEntry(): Promise { + const sherpa = await resolveSherpaOnnxEntry(); + if (sherpa) { + return sherpa; + } + const whisperCpp = await resolveLocalWhisperCppEntry(); + if (whisperCpp) { + return whisperCpp; + } + return await resolveLocalWhisperEntry(); +} + +async function resolveGeminiCliEntry( + _capability: MediaUnderstandingCapability, +): Promise { + if (!(await probeGeminiCli())) { + return null; + } + return { + type: "cli", + command: "gemini", + args: [ + "--output-format", + "json", + "--allowed-tools", + "read_many_files", + "--include-directories", + "{{MediaDir}}", + "{{Prompt}}", + "Use read_many_files to read {{MediaPath}} and respond with only the text output.", + ], + }; +} + +async function resolveActiveModelEntry(params: { + cfg: OpenClawConfig; + agentDir?: string; + providerRegistry: ProviderRegistry; + capability: MediaUnderstandingCapability; + activeModel?: ActiveMediaModel; +}): Promise { + const activeProviderRaw = params.activeModel?.provider?.trim(); + if (!activeProviderRaw) { + return null; + } + const providerId = normalizeExtensionHostMediaProviderId(activeProviderRaw); + if (!providerId) { + return null; + } + const provider = getExtensionHostMediaUnderstandingProvider(providerId, params.providerRegistry); + if (!provider) { + return null; + } + if (params.capability === "audio" && !provider.transcribeAudio) { + return null; + } + if (params.capability === "image" && !provider.describeImage) { + return null; + } + if (params.capability === "video" && !provider.describeVideo) { + return null; + } + try { + await resolveApiKeyForProvider({ + provider: providerId, + cfg: params.cfg, + agentDir: params.agentDir, + }); + } catch { + return null; + } + return { + type: "provider", + provider: providerId, + model: params.activeModel?.model, + }; +} + +async function resolveKeyEntry(params: { + cfg: OpenClawConfig; + agentDir?: string; + providerRegistry: ProviderRegistry; + capability: MediaUnderstandingCapability; + activeModel?: ActiveMediaModel; +}): Promise { + const { cfg, agentDir, providerRegistry, capability } = params; + const checkProvider = async ( + providerId: string, + model?: string, + ): Promise => { + const provider = getExtensionHostMediaUnderstandingProvider(providerId, providerRegistry); + if (!provider) { + return null; + } + if (capability === "audio" && !provider.transcribeAudio) { + return null; + } + if (capability === "image" && !provider.describeImage) { + return null; + } + if (capability === "video" && !provider.describeVideo) { + return null; + } + try { + await resolveApiKeyForProvider({ provider: providerId, cfg, agentDir }); + return { type: "provider", provider: providerId, model }; + } catch { + return null; + } + }; + + if (capability === "image") { + const activeProvider = params.activeModel?.provider?.trim(); + if (activeProvider) { + const activeEntry = await checkProvider(activeProvider, params.activeModel?.model); + if (activeEntry) { + return activeEntry; + } + } + for (const providerId of AUTO_IMAGE_KEY_PROVIDERS) { + const model = DEFAULT_IMAGE_MODELS[providerId]; + const entry = await checkProvider(providerId, model); + if (entry) { + return entry; + } + } + return null; + } + + if (capability === "video") { + const activeProvider = params.activeModel?.provider?.trim(); + if (activeProvider) { + const activeEntry = await checkProvider(activeProvider, params.activeModel?.model); + if (activeEntry) { + return activeEntry; + } + } + for (const providerId of AUTO_VIDEO_KEY_PROVIDERS) { + const entry = await checkProvider(providerId, undefined); + if (entry) { + return entry; + } + } + return null; + } + + const activeProvider = params.activeModel?.provider?.trim(); + if (activeProvider) { + const activeEntry = await checkProvider(activeProvider, params.activeModel?.model); + if (activeEntry) { + return activeEntry; + } + } + for (const providerId of AUTO_AUDIO_KEY_PROVIDERS) { + const entry = await checkProvider(providerId, undefined); + if (entry) { + return entry; + } + } + return null; +} + +function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstandingModelConfig[] { + const refs: string[] = []; + const primary = resolveAgentModelPrimaryValue(cfg.agents?.defaults?.imageModel); + if (primary?.trim()) { + refs.push(primary.trim()); + } + for (const fb of resolveAgentModelFallbackValues(cfg.agents?.defaults?.imageModel)) { + if (fb?.trim()) { + refs.push(fb.trim()); + } + } + if (refs.length === 0) { + return []; + } + const entries: MediaUnderstandingModelConfig[] = []; + for (const ref of refs) { + const slashIdx = ref.indexOf("/"); + if (slashIdx <= 0 || slashIdx >= ref.length - 1) { + continue; + } + entries.push({ + type: "provider", + provider: ref.slice(0, slashIdx), + model: ref.slice(slashIdx + 1), + }); + } + return entries; +} + +export async function resolveAutoEntries(params: { + cfg: OpenClawConfig; + agentDir?: string; + providerRegistry: ProviderRegistry; + capability: MediaUnderstandingCapability; + activeModel?: ActiveMediaModel; +}): Promise { + const activeEntry = await resolveActiveModelEntry(params); + if (activeEntry) { + return [activeEntry]; + } + if (params.capability === "audio") { + const localAudio = await resolveLocalAudioEntry(); + if (localAudio) { + return [localAudio]; + } + } + if (params.capability === "image") { + const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg); + if (imageModelEntries.length > 0) { + return imageModelEntries; + } + } + const gemini = await resolveGeminiCliEntry(params.capability); + if (gemini) { + return [gemini]; + } + const keys = await resolveKeyEntry(params); + if (keys) { + return [keys]; + } + return []; +} + +export async function resolveAutoImageModel(params: { + cfg: OpenClawConfig; + agentDir?: string; + activeModel?: ActiveMediaModel; + providerRegistry: ProviderRegistry; +}): Promise { + const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => { + if (!entry || entry.type === "cli") { + return null; + } + const provider = entry.provider; + if (!provider) { + return null; + } + const model = entry.model ?? DEFAULT_IMAGE_MODELS[provider]; + if (!model) { + return null; + } + return { provider, model }; + }; + const activeEntry = await resolveActiveModelEntry({ + cfg: params.cfg, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + capability: "image", + activeModel: params.activeModel, + }); + const resolvedActive = toActive(activeEntry); + if (resolvedActive) { + return resolvedActive; + } + const keyEntry = await resolveKeyEntry({ + cfg: params.cfg, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + capability: "image", + activeModel: params.activeModel, + }); + return toActive(keyEntry); +} diff --git a/src/extension-host/media-runtime-orchestration.test.ts b/src/extension-host/media-runtime-orchestration.test.ts new file mode 100644 index 00000000000..bb02eeac30c --- /dev/null +++ b/src/extension-host/media-runtime-orchestration.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it, vi } from "vitest"; +import type { MsgContext } from "../auto-reply/templating.js"; +import type { OpenClawConfig } from "../config/config.js"; +import { + createMediaAttachmentCache, + normalizeMediaAttachments, +} from "../media-understanding/runner.js"; +import { runCapability } from "./media-runtime-orchestration.js"; +import { buildExtensionHostMediaUnderstandingRegistry } from "./media-runtime-registry.js"; + +const catalog = [ + { + id: "gpt-4.1", + name: "GPT-4.1", + provider: "openai", + input: ["text", "image"] as const, + }, +]; + +vi.mock("../agents/model-catalog.js", async () => { + const actual = await vi.importActual( + "../agents/model-catalog.js", + ); + return { + ...actual, + loadModelCatalog: vi.fn(async () => catalog), + }; +}); + +describe("media runtime orchestration", () => { + it("skips image understanding when the active model already supports vision", async () => { + const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" }; + const media = normalizeMediaAttachments(ctx); + const cache = createMediaAttachmentCache(media); + const cfg = {} as OpenClawConfig; + + try { + const result = await runCapability({ + capability: "image", + cfg, + ctx, + attachments: cache, + media, + providerRegistry: buildExtensionHostMediaUnderstandingRegistry(), + activeModel: { provider: "openai", model: "gpt-4.1" }, + }); + + expect(result.outputs).toHaveLength(0); + expect(result.decision.outcome).toBe("skipped"); + expect(result.decision.attachments).toHaveLength(1); + expect(result.decision.attachments[0]?.attempts[0]?.reason).toBe( + "primary model supports vision natively", + ); + } finally { + await cache.cleanup(); + } + }); +}); diff --git a/src/extension-host/media-runtime-orchestration.ts b/src/extension-host/media-runtime-orchestration.ts new file mode 100644 index 00000000000..79161068de7 --- /dev/null +++ b/src/extension-host/media-runtime-orchestration.ts @@ -0,0 +1,272 @@ +import { + findModelInCatalog, + loadModelCatalog, + modelSupportsVision, +} from "../agents/model-catalog.js"; +import type { MsgContext } from "../auto-reply/templating.js"; +import type { OpenClawConfig } from "../config/config.js"; +import type { + MediaUnderstandingConfig, + MediaUnderstandingModelConfig, +} from "../config/types.tools.js"; +import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { MediaAttachmentCache, selectAttachments } from "../media-understanding/attachments.js"; +import { isMediaUnderstandingSkipError } from "../media-understanding/errors.js"; +import { resolveModelEntries, resolveScopeDecision } from "../media-understanding/resolve.js"; +import { + buildModelDecision, + formatDecisionSummary, + runCliEntry, + runProviderEntry, +} from "../media-understanding/runner.entries.js"; +import type { + MediaAttachment, + MediaUnderstandingCapability, + MediaUnderstandingDecision, + MediaUnderstandingModelDecision, + MediaUnderstandingOutput, + MediaUnderstandingProvider, +} from "../media-understanding/types.js"; +import { resolveAutoEntries, type ActiveMediaModel } from "./media-runtime-auto.js"; + +type ProviderRegistry = Map; + +export type RunCapabilityResult = { + outputs: MediaUnderstandingOutput[]; + decision: MediaUnderstandingDecision; +}; + +async function runAttachmentEntries(params: { + capability: MediaUnderstandingCapability; + cfg: OpenClawConfig; + ctx: MsgContext; + attachmentIndex: number; + agentDir?: string; + providerRegistry: ProviderRegistry; + cache: MediaAttachmentCache; + entries: MediaUnderstandingModelConfig[]; + config?: MediaUnderstandingConfig; +}): Promise<{ + output: MediaUnderstandingOutput | null; + attempts: MediaUnderstandingModelDecision[]; +}> { + const { entries, capability } = params; + const attempts: MediaUnderstandingModelDecision[] = []; + for (const entry of entries) { + const entryType = entry.type ?? (entry.command ? "cli" : "provider"); + try { + const result = + entryType === "cli" + ? await runCliEntry({ + capability, + entry, + cfg: params.cfg, + ctx: params.ctx, + attachmentIndex: params.attachmentIndex, + cache: params.cache, + config: params.config, + }) + : await runProviderEntry({ + capability, + entry, + cfg: params.cfg, + ctx: params.ctx, + attachmentIndex: params.attachmentIndex, + cache: params.cache, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + config: params.config, + }); + if (result) { + const decision = buildModelDecision({ entry, entryType, outcome: "success" }); + if (result.provider) { + decision.provider = result.provider; + } + if (result.model) { + decision.model = result.model; + } + attempts.push(decision); + return { output: result, attempts }; + } + attempts.push( + buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }), + ); + } catch (err) { + if (isMediaUnderstandingSkipError(err)) { + attempts.push( + buildModelDecision({ + entry, + entryType, + outcome: "skipped", + reason: `${err.reason}: ${err.message}`, + }), + ); + if (shouldLogVerbose()) { + logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`); + } + continue; + } + attempts.push( + buildModelDecision({ + entry, + entryType, + outcome: "failed", + reason: String(err), + }), + ); + if (shouldLogVerbose()) { + logVerbose(`${capability} understanding failed: ${String(err)}`); + } + } + } + + return { output: null, attempts }; +} + +export async function runCapability(params: { + capability: MediaUnderstandingCapability; + cfg: OpenClawConfig; + ctx: MsgContext; + attachments: MediaAttachmentCache; + media: MediaAttachment[]; + agentDir?: string; + providerRegistry: ProviderRegistry; + config?: MediaUnderstandingConfig; + activeModel?: ActiveMediaModel; +}): Promise { + const { capability, cfg, ctx } = params; + const config = params.config ?? cfg.tools?.media?.[capability]; + if (config?.enabled === false) { + return { + outputs: [], + decision: { capability, outcome: "disabled", attachments: [] }, + }; + } + + const attachmentPolicy = config?.attachments; + const selected = selectAttachments({ + capability, + attachments: params.media, + policy: attachmentPolicy, + }); + if (selected.length === 0) { + return { + outputs: [], + decision: { capability, outcome: "no-attachment", attachments: [] }, + }; + } + + const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx }); + if (scopeDecision === "deny") { + if (shouldLogVerbose()) { + logVerbose(`${capability} understanding disabled by scope policy.`); + } + return { + outputs: [], + decision: { + capability, + outcome: "scope-deny", + attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })), + }, + }; + } + + // Skip image understanding when the primary model supports vision natively. + // The image will be injected directly into the model context instead. + const activeProvider = params.activeModel?.provider?.trim(); + if (capability === "image" && activeProvider) { + const catalog = await loadModelCatalog({ config: cfg }); + const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? ""); + if (modelSupportsVision(entry)) { + if (shouldLogVerbose()) { + logVerbose("Skipping image understanding: primary model supports vision natively"); + } + const model = params.activeModel?.model?.trim(); + const reason = "primary model supports vision natively"; + return { + outputs: [], + decision: { + capability, + outcome: "skipped", + attachments: selected.map((item) => { + const attempt = { + type: "provider" as const, + provider: activeProvider, + model: model || undefined, + outcome: "skipped" as const, + reason, + }; + return { + attachmentIndex: item.index, + attempts: [attempt], + chosen: attempt, + }; + }), + }, + }; + } + } + + const entries = resolveModelEntries({ + cfg, + capability, + config, + providerRegistry: params.providerRegistry, + }); + let resolvedEntries = entries; + if (resolvedEntries.length === 0) { + resolvedEntries = await resolveAutoEntries({ + cfg, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + capability, + activeModel: params.activeModel, + }); + } + if (resolvedEntries.length === 0) { + return { + outputs: [], + decision: { + capability, + outcome: "skipped", + attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })), + }, + }; + } + + const outputs: MediaUnderstandingOutput[] = []; + const attachmentDecisions: MediaUnderstandingDecision["attachments"] = []; + for (const attachment of selected) { + const { output, attempts } = await runAttachmentEntries({ + capability, + cfg, + ctx, + attachmentIndex: attachment.index, + agentDir: params.agentDir, + providerRegistry: params.providerRegistry, + cache: params.attachments, + entries: resolvedEntries, + config, + }); + if (output) { + outputs.push(output); + } + attachmentDecisions.push({ + attachmentIndex: attachment.index, + attempts, + chosen: attempts.find((attempt) => attempt.outcome === "success"), + }); + } + const decision: MediaUnderstandingDecision = { + capability, + outcome: outputs.length > 0 ? "success" : "skipped", + attachments: attachmentDecisions, + }; + if (shouldLogVerbose()) { + logVerbose(`Media understanding ${formatDecisionSummary(decision)}`); + } + return { + outputs, + decision, + }; +} diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index f4b9e09d13c..4cf2a923e9c 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -1,77 +1,38 @@ -import { constants as fsConstants } from "node:fs"; -import fs from "node:fs/promises"; -import os from "node:os"; -import path from "node:path"; -import { resolveApiKeyForProvider } from "../agents/model-auth.js"; -import { - findModelInCatalog, - loadModelCatalog, - modelSupportsVision, -} from "../agents/model-catalog.js"; import type { MsgContext } from "../auto-reply/templating.js"; import type { OpenClawConfig } from "../config/config.js"; +import type { MediaUnderstandingConfig } from "../config/types.tools.js"; import { - resolveAgentModelFallbackValues, - resolveAgentModelPrimaryValue, -} from "../config/model-input.js"; -import type { - MediaUnderstandingConfig, - MediaUnderstandingModelConfig, -} from "../config/types.tools.js"; + clearMediaUnderstandingBinaryCacheForTests as clearExtensionHostMediaUnderstandingBinaryCacheForTests, + resolveAutoImageModel as resolveExtensionHostAutoImageModel, + type ActiveMediaModel, +} from "../extension-host/media-runtime-auto.js"; +import { + runCapability as runExtensionHostMediaCapability, + type RunCapabilityResult, +} from "../extension-host/media-runtime-orchestration.js"; import { buildExtensionHostMediaUnderstandingRegistry, - getExtensionHostMediaUnderstandingProvider, - normalizeExtensionHostMediaProviderId, + type ExtensionHostMediaUnderstandingProviderRegistry, } from "../extension-host/media-runtime-registry.js"; -import { logVerbose, shouldLogVerbose } from "../globals.js"; import { mergeInboundPathRoots, resolveIMessageAttachmentRoots, } from "../media/inbound-path-policy.js"; import { getDefaultMediaLocalRoots } from "../media/local-roots.js"; -import { runExec } from "../process/exec.js"; import { MediaAttachmentCache, type MediaAttachmentCacheOptions, normalizeAttachments, - selectAttachments, } from "./attachments.js"; -import { - AUTO_AUDIO_KEY_PROVIDERS, - AUTO_IMAGE_KEY_PROVIDERS, - AUTO_VIDEO_KEY_PROVIDERS, - DEFAULT_IMAGE_MODELS, -} from "./defaults.js"; -import { isMediaUnderstandingSkipError } from "./errors.js"; -import { fileExists } from "./fs.js"; -import { extractGeminiResponse } from "./output-extract.js"; -import { resolveModelEntries, resolveScopeDecision } from "./resolve.js"; -import { - buildModelDecision, - formatDecisionSummary, - runCliEntry, - runProviderEntry, -} from "./runner.entries.js"; import type { MediaAttachment, MediaUnderstandingCapability, - MediaUnderstandingDecision, - MediaUnderstandingModelDecision, - MediaUnderstandingOutput, MediaUnderstandingProvider, } from "./types.js"; -export type ActiveMediaModel = { - provider: string; - model?: string; -}; +type ProviderRegistry = ExtensionHostMediaUnderstandingProviderRegistry; -type ProviderRegistry = Map; - -export type RunCapabilityResult = { - outputs: MediaUnderstandingOutput[]; - decision: MediaUnderstandingDecision; -}; +export type { ActiveMediaModel, RunCapabilityResult }; export function buildProviderRegistry( overrides?: Record, @@ -103,385 +64,8 @@ export function createMediaAttachmentCache( return new MediaAttachmentCache(attachments, options); } -const binaryCache = new Map>(); -const geminiProbeCache = new Map>(); - export function clearMediaUnderstandingBinaryCacheForTests(): void { - binaryCache.clear(); - geminiProbeCache.clear(); -} - -function expandHomeDir(value: string): string { - if (!value.startsWith("~")) { - return value; - } - const home = os.homedir(); - if (value === "~") { - return home; - } - if (value.startsWith("~/")) { - return path.join(home, value.slice(2)); - } - return value; -} - -function hasPathSeparator(value: string): boolean { - return value.includes("/") || value.includes("\\"); -} - -function candidateBinaryNames(name: string): string[] { - if (process.platform !== "win32") { - return [name]; - } - const ext = path.extname(name); - if (ext) { - return [name]; - } - const pathext = (process.env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM") - .split(";") - .map((item) => item.trim()) - .filter(Boolean) - .map((item) => (item.startsWith(".") ? item : `.${item}`)); - const unique = Array.from(new Set(pathext)); - return [name, ...unique.map((item) => `${name}${item}`)]; -} - -async function isExecutable(filePath: string): Promise { - try { - const stat = await fs.stat(filePath); - if (!stat.isFile()) { - return false; - } - if (process.platform === "win32") { - return true; - } - await fs.access(filePath, fsConstants.X_OK); - return true; - } catch { - return false; - } -} - -async function findBinary(name: string): Promise { - const cached = binaryCache.get(name); - if (cached) { - return cached; - } - const resolved = (async () => { - const direct = expandHomeDir(name.trim()); - if (direct && hasPathSeparator(direct)) { - for (const candidate of candidateBinaryNames(direct)) { - if (await isExecutable(candidate)) { - return candidate; - } - } - } - - const searchName = name.trim(); - if (!searchName) { - return null; - } - const pathEntries = (process.env.PATH ?? "").split(path.delimiter); - const candidates = candidateBinaryNames(searchName); - for (const entryRaw of pathEntries) { - const entry = expandHomeDir(entryRaw.trim().replace(/^"(.*)"$/, "$1")); - if (!entry) { - continue; - } - for (const candidate of candidates) { - const fullPath = path.join(entry, candidate); - if (await isExecutable(fullPath)) { - return fullPath; - } - } - } - - return null; - })(); - binaryCache.set(name, resolved); - return resolved; -} - -async function hasBinary(name: string): Promise { - return Boolean(await findBinary(name)); -} - -async function probeGeminiCli(): Promise { - const cached = geminiProbeCache.get("gemini"); - if (cached) { - return cached; - } - const resolved = (async () => { - if (!(await hasBinary("gemini"))) { - return false; - } - try { - const { stdout } = await runExec("gemini", ["--output-format", "json", "ok"], { - timeoutMs: 8000, - }); - return Boolean(extractGeminiResponse(stdout) ?? stdout.toLowerCase().includes("ok")); - } catch { - return false; - } - })(); - geminiProbeCache.set("gemini", resolved); - return resolved; -} - -async function resolveLocalWhisperCppEntry(): Promise { - if (!(await hasBinary("whisper-cli"))) { - return null; - } - const envModel = process.env.WHISPER_CPP_MODEL?.trim(); - const defaultModel = "/opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin"; - const modelPath = envModel && (await fileExists(envModel)) ? envModel : defaultModel; - if (!(await fileExists(modelPath))) { - return null; - } - return { - type: "cli", - command: "whisper-cli", - args: ["-m", modelPath, "-otxt", "-of", "{{OutputBase}}", "-np", "-nt", "{{MediaPath}}"], - }; -} - -async function resolveLocalWhisperEntry(): Promise { - if (!(await hasBinary("whisper"))) { - return null; - } - return { - type: "cli", - command: "whisper", - args: [ - "--model", - "turbo", - "--output_format", - "txt", - "--output_dir", - "{{OutputDir}}", - "--verbose", - "False", - "{{MediaPath}}", - ], - }; -} - -async function resolveSherpaOnnxEntry(): Promise { - if (!(await hasBinary("sherpa-onnx-offline"))) { - return null; - } - const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim(); - if (!modelDir) { - return null; - } - const tokens = path.join(modelDir, "tokens.txt"); - const encoder = path.join(modelDir, "encoder.onnx"); - const decoder = path.join(modelDir, "decoder.onnx"); - const joiner = path.join(modelDir, "joiner.onnx"); - if (!(await fileExists(tokens))) { - return null; - } - if (!(await fileExists(encoder))) { - return null; - } - if (!(await fileExists(decoder))) { - return null; - } - if (!(await fileExists(joiner))) { - return null; - } - return { - type: "cli", - command: "sherpa-onnx-offline", - args: [ - `--tokens=${tokens}`, - `--encoder=${encoder}`, - `--decoder=${decoder}`, - `--joiner=${joiner}`, - "{{MediaPath}}", - ], - }; -} - -async function resolveLocalAudioEntry(): Promise { - const sherpa = await resolveSherpaOnnxEntry(); - if (sherpa) { - return sherpa; - } - const whisperCpp = await resolveLocalWhisperCppEntry(); - if (whisperCpp) { - return whisperCpp; - } - return await resolveLocalWhisperEntry(); -} - -async function resolveGeminiCliEntry( - _capability: MediaUnderstandingCapability, -): Promise { - if (!(await probeGeminiCli())) { - return null; - } - return { - type: "cli", - command: "gemini", - args: [ - "--output-format", - "json", - "--allowed-tools", - "read_many_files", - "--include-directories", - "{{MediaDir}}", - "{{Prompt}}", - "Use read_many_files to read {{MediaPath}} and respond with only the text output.", - ], - }; -} - -async function resolveKeyEntry(params: { - cfg: OpenClawConfig; - agentDir?: string; - providerRegistry: ProviderRegistry; - capability: MediaUnderstandingCapability; - activeModel?: ActiveMediaModel; -}): Promise { - const { cfg, agentDir, providerRegistry, capability } = params; - const checkProvider = async ( - providerId: string, - model?: string, - ): Promise => { - const provider = getExtensionHostMediaUnderstandingProvider(providerId, providerRegistry); - if (!provider) { - return null; - } - if (capability === "audio" && !provider.transcribeAudio) { - return null; - } - if (capability === "image" && !provider.describeImage) { - return null; - } - if (capability === "video" && !provider.describeVideo) { - return null; - } - try { - await resolveApiKeyForProvider({ provider: providerId, cfg, agentDir }); - return { type: "provider" as const, provider: providerId, model }; - } catch { - return null; - } - }; - - if (capability === "image") { - const activeProvider = params.activeModel?.provider?.trim(); - if (activeProvider) { - const activeEntry = await checkProvider(activeProvider, params.activeModel?.model); - if (activeEntry) { - return activeEntry; - } - } - for (const providerId of AUTO_IMAGE_KEY_PROVIDERS) { - const model = DEFAULT_IMAGE_MODELS[providerId]; - const entry = await checkProvider(providerId, model); - if (entry) { - return entry; - } - } - return null; - } - - if (capability === "video") { - const activeProvider = params.activeModel?.provider?.trim(); - if (activeProvider) { - const activeEntry = await checkProvider(activeProvider, params.activeModel?.model); - if (activeEntry) { - return activeEntry; - } - } - for (const providerId of AUTO_VIDEO_KEY_PROVIDERS) { - const entry = await checkProvider(providerId, undefined); - if (entry) { - return entry; - } - } - return null; - } - - const activeProvider = params.activeModel?.provider?.trim(); - if (activeProvider) { - const activeEntry = await checkProvider(activeProvider, params.activeModel?.model); - if (activeEntry) { - return activeEntry; - } - } - for (const providerId of AUTO_AUDIO_KEY_PROVIDERS) { - const entry = await checkProvider(providerId, undefined); - if (entry) { - return entry; - } - } - return null; -} - -function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstandingModelConfig[] { - const refs: string[] = []; - const primary = resolveAgentModelPrimaryValue(cfg.agents?.defaults?.imageModel); - if (primary?.trim()) { - refs.push(primary.trim()); - } - for (const fb of resolveAgentModelFallbackValues(cfg.agents?.defaults?.imageModel)) { - if (fb?.trim()) { - refs.push(fb.trim()); - } - } - if (refs.length === 0) { - return []; - } - const entries: MediaUnderstandingModelConfig[] = []; - for (const ref of refs) { - const slashIdx = ref.indexOf("/"); - if (slashIdx <= 0 || slashIdx >= ref.length - 1) { - continue; - } - entries.push({ - type: "provider", - provider: ref.slice(0, slashIdx), - model: ref.slice(slashIdx + 1), - }); - } - return entries; -} - -async function resolveAutoEntries(params: { - cfg: OpenClawConfig; - agentDir?: string; - providerRegistry: ProviderRegistry; - capability: MediaUnderstandingCapability; - activeModel?: ActiveMediaModel; -}): Promise { - const activeEntry = await resolveActiveModelEntry(params); - if (activeEntry) { - return [activeEntry]; - } - if (params.capability === "audio") { - const localAudio = await resolveLocalAudioEntry(); - if (localAudio) { - return [localAudio]; - } - } - if (params.capability === "image") { - const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg); - if (imageModelEntries.length > 0) { - return imageModelEntries; - } - } - const gemini = await resolveGeminiCliEntry(params.capability); - if (gemini) { - return [gemini]; - } - const keys = await resolveKeyEntry(params); - if (keys) { - return [keys]; - } - return []; + clearExtensionHostMediaUnderstandingBinaryCacheForTests(); } export async function resolveAutoImageModel(params: { @@ -489,171 +73,10 @@ export async function resolveAutoImageModel(params: { agentDir?: string; activeModel?: ActiveMediaModel; }): Promise { - const providerRegistry = buildProviderRegistry(); - const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => { - if (!entry || entry.type === "cli") { - return null; - } - const provider = entry.provider; - if (!provider) { - return null; - } - const model = entry.model ?? DEFAULT_IMAGE_MODELS[provider]; - if (!model) { - return null; - } - return { provider, model }; - }; - const activeEntry = await resolveActiveModelEntry({ - cfg: params.cfg, - agentDir: params.agentDir, - providerRegistry, - capability: "image", - activeModel: params.activeModel, + return await resolveExtensionHostAutoImageModel({ + ...params, + providerRegistry: buildProviderRegistry(), }); - const resolvedActive = toActive(activeEntry); - if (resolvedActive) { - return resolvedActive; - } - const keyEntry = await resolveKeyEntry({ - cfg: params.cfg, - agentDir: params.agentDir, - providerRegistry, - capability: "image", - activeModel: params.activeModel, - }); - return toActive(keyEntry); -} - -async function resolveActiveModelEntry(params: { - cfg: OpenClawConfig; - agentDir?: string; - providerRegistry: ProviderRegistry; - capability: MediaUnderstandingCapability; - activeModel?: ActiveMediaModel; -}): Promise { - const activeProviderRaw = params.activeModel?.provider?.trim(); - if (!activeProviderRaw) { - return null; - } - const providerId = normalizeExtensionHostMediaProviderId(activeProviderRaw); - if (!providerId) { - return null; - } - const provider = getExtensionHostMediaUnderstandingProvider(providerId, params.providerRegistry); - if (!provider) { - return null; - } - if (params.capability === "audio" && !provider.transcribeAudio) { - return null; - } - if (params.capability === "image" && !provider.describeImage) { - return null; - } - if (params.capability === "video" && !provider.describeVideo) { - return null; - } - try { - await resolveApiKeyForProvider({ - provider: providerId, - cfg: params.cfg, - agentDir: params.agentDir, - }); - } catch { - return null; - } - return { - type: "provider", - provider: providerId, - model: params.activeModel?.model, - }; -} - -async function runAttachmentEntries(params: { - capability: MediaUnderstandingCapability; - cfg: OpenClawConfig; - ctx: MsgContext; - attachmentIndex: number; - agentDir?: string; - providerRegistry: ProviderRegistry; - cache: MediaAttachmentCache; - entries: MediaUnderstandingModelConfig[]; - config?: MediaUnderstandingConfig; -}): Promise<{ - output: MediaUnderstandingOutput | null; - attempts: MediaUnderstandingModelDecision[]; -}> { - const { entries, capability } = params; - const attempts: MediaUnderstandingModelDecision[] = []; - for (const entry of entries) { - const entryType = entry.type ?? (entry.command ? "cli" : "provider"); - try { - const result = - entryType === "cli" - ? await runCliEntry({ - capability, - entry, - cfg: params.cfg, - ctx: params.ctx, - attachmentIndex: params.attachmentIndex, - cache: params.cache, - config: params.config, - }) - : await runProviderEntry({ - capability, - entry, - cfg: params.cfg, - ctx: params.ctx, - attachmentIndex: params.attachmentIndex, - cache: params.cache, - agentDir: params.agentDir, - providerRegistry: params.providerRegistry, - config: params.config, - }); - if (result) { - const decision = buildModelDecision({ entry, entryType, outcome: "success" }); - if (result.provider) { - decision.provider = result.provider; - } - if (result.model) { - decision.model = result.model; - } - attempts.push(decision); - return { output: result, attempts }; - } - attempts.push( - buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }), - ); - } catch (err) { - if (isMediaUnderstandingSkipError(err)) { - attempts.push( - buildModelDecision({ - entry, - entryType, - outcome: "skipped", - reason: `${err.reason}: ${err.message}`, - }), - ); - if (shouldLogVerbose()) { - logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`); - } - continue; - } - attempts.push( - buildModelDecision({ - entry, - entryType, - outcome: "failed", - reason: String(err), - }), - ); - if (shouldLogVerbose()) { - logVerbose(`${capability} understanding failed: ${String(err)}`); - } - } - } - - return { output: null, attempts }; } export async function runCapability(params: { @@ -667,139 +90,5 @@ export async function runCapability(params: { config?: MediaUnderstandingConfig; activeModel?: ActiveMediaModel; }): Promise { - const { capability, cfg, ctx } = params; - const config = params.config ?? cfg.tools?.media?.[capability]; - if (config?.enabled === false) { - return { - outputs: [], - decision: { capability, outcome: "disabled", attachments: [] }, - }; - } - - const attachmentPolicy = config?.attachments; - const selected = selectAttachments({ - capability, - attachments: params.media, - policy: attachmentPolicy, - }); - if (selected.length === 0) { - return { - outputs: [], - decision: { capability, outcome: "no-attachment", attachments: [] }, - }; - } - - const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx }); - if (scopeDecision === "deny") { - if (shouldLogVerbose()) { - logVerbose(`${capability} understanding disabled by scope policy.`); - } - return { - outputs: [], - decision: { - capability, - outcome: "scope-deny", - attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })), - }, - }; - } - - // Skip image understanding when the primary model supports vision natively. - // The image will be injected directly into the model context instead. - const activeProvider = params.activeModel?.provider?.trim(); - if (capability === "image" && activeProvider) { - const catalog = await loadModelCatalog({ config: cfg }); - const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? ""); - if (modelSupportsVision(entry)) { - if (shouldLogVerbose()) { - logVerbose("Skipping image understanding: primary model supports vision natively"); - } - const model = params.activeModel?.model?.trim(); - const reason = "primary model supports vision natively"; - return { - outputs: [], - decision: { - capability, - outcome: "skipped", - attachments: selected.map((item) => { - const attempt = { - type: "provider" as const, - provider: activeProvider, - model: model || undefined, - outcome: "skipped" as const, - reason, - }; - return { - attachmentIndex: item.index, - attempts: [attempt], - chosen: attempt, - }; - }), - }, - }; - } - } - - const entries = resolveModelEntries({ - cfg, - capability, - config, - providerRegistry: params.providerRegistry, - }); - let resolvedEntries = entries; - if (resolvedEntries.length === 0) { - resolvedEntries = await resolveAutoEntries({ - cfg, - agentDir: params.agentDir, - providerRegistry: params.providerRegistry, - capability, - activeModel: params.activeModel, - }); - } - if (resolvedEntries.length === 0) { - return { - outputs: [], - decision: { - capability, - outcome: "skipped", - attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })), - }, - }; - } - - const outputs: MediaUnderstandingOutput[] = []; - const attachmentDecisions: MediaUnderstandingDecision["attachments"] = []; - for (const attachment of selected) { - const { output, attempts } = await runAttachmentEntries({ - capability, - cfg, - ctx, - attachmentIndex: attachment.index, - agentDir: params.agentDir, - providerRegistry: params.providerRegistry, - cache: params.attachments, - entries: resolvedEntries, - config, - }); - if (output) { - outputs.push(output); - } - attachmentDecisions.push({ - attachmentIndex: attachment.index, - attempts, - chosen: attempts.find((attempt) => attempt.outcome === "success"), - }); - } - const decision: MediaUnderstandingDecision = { - capability, - outcome: outputs.length > 0 ? "success" : "skipped", - attachments: attachmentDecisions, - }; - if (shouldLogVerbose()) { - logVerbose(`Media understanding ${formatDecisionSummary(decision)}`); - } - return { - outputs, - decision, - }; + return await runExtensionHostMediaCapability(params); }