From 37ee04e9b0fdd95b0bb28205086523e890cee5e4 Mon Sep 17 00:00:00 2001
From: Gustavo Madeira Santana <gumadeiras@gmail.com>
Date: Sun, 15 Mar 2026 19:52:50 +0000
Subject: [PATCH] Media: extract runtime auto and orchestration

---
 src/agents/model-auth.ts                      |   2 +-
 src/extension-host/media-runtime-auto.test.ts |  52 ++
 src/extension-host/media-runtime-auto.ts      | 499 ++++++++++++
 .../media-runtime-orchestration.test.ts       |  58 ++
 .../media-runtime-orchestration.ts            | 272 +++++++
 src/media-understanding/runner.ts             | 745 +-----------------
 6 files changed, 899 insertions(+), 729 deletions(-)
 create mode 100644 src/extension-host/media-runtime-auto.test.ts
 create mode 100644 src/extension-host/media-runtime-auto.ts
 create mode 100644 src/extension-host/media-runtime-orchestration.test.ts
 create mode 100644 src/extension-host/media-runtime-orchestration.ts

diff --git a/src/agents/model-auth.ts b/src/agents/model-auth.ts
index fb3abd1571e..19f591c2fe1 100644
--- a/src/agents/model-auth.ts
+++ b/src/agents/model-auth.ts
@@ -25,7 +25,7 @@ import {
   isNonSecretApiKeyMarker,
   OLLAMA_LOCAL_AUTH_MARKER,
 } from "./model-auth-markers.js";
-import { normalizeProviderId } from "./model-selection.js";
+import { normalizeProviderId } from "./provider-id.js";
 
 export { ensureAuthProfileStore, resolveAuthProfileOrder } from "./auth-profiles.js";
 
diff --git a/src/extension-host/media-runtime-auto.test.ts b/src/extension-host/media-runtime-auto.test.ts
new file mode 100644
index 00000000000..e33897e8431
--- /dev/null
+++ b/src/extension-host/media-runtime-auto.test.ts
@@ -0,0 +1,52 @@
+import { describe, expect, it } from "vitest";
+import type { OpenClawConfig } from "../config/config.js";
+import { DEFAULT_IMAGE_MODELS } from "../media-understanding/defaults.js";
+import { resolveAutoImageModel } from "./media-runtime-auto.js";
+import { buildExtensionHostMediaUnderstandingRegistry } from "./media-runtime-registry.js";
+
+/**
+ * Test fixture: a config whose only model provider is `openai`, carrying a dummy
+ * API key and an empty model list. The double cast skips the rest of the config
+ * shape, which is not filled in here.
+ */
+function createImageCfg(): OpenClawConfig {
+  const openaiProvider = { apiKey: "test-key", models: [] };
+  const partialConfig = {
+    models: { providers: { openai: openaiProvider } },
+  };
+  return partialConfig as unknown as OpenClawConfig;
+}
+
+// Exercises resolveAutoImageModel with the registry from buildExtensionHostMediaUnderstandingRegistry().
+describe("media runtime auto image model", () => {
+  const resolveFor = (activeModel: { provider: string; model: string }) =>
+    resolveAutoImageModel({
+      cfg: createImageCfg(),
+      providerRegistry: buildExtensionHostMediaUnderstandingRegistry(),
+      activeModel,
+    });
+
+  it("keeps a valid active image model", async () => {
+    const activeModel = { provider: "openai", model: "gpt-4.1-mini" };
+
+    const resolved = await resolveFor(activeModel);
+
+    // openai has a key in the fixture config, so the active model should be kept as-is.
+    expect(resolved).toEqual({
+      provider: "openai",
+      model: "gpt-4.1-mini",
+    });
+  });
+
+  it("falls back to the default keyed image model when the active model cannot be used", async () => {
+    const activeModel = { provider: "missing-provider", model: "ignored" };
+
+    const resolved = await resolveFor(activeModel);
+
+    // The unusable active model should give way to the keyed openai default.
+    expect(resolved).toEqual({
+      provider: "openai",
+      model: DEFAULT_IMAGE_MODELS.openai,
+    });
+  });
+});
diff --git a/src/extension-host/media-runtime-auto.ts b/src/extension-host/media-runtime-auto.ts
new file mode 100644
index 00000000000..508666f14eb
--- /dev/null
+++ b/src/extension-host/media-runtime-auto.ts
@@ -0,0 +1,499 @@
+import { constants as fsConstants } from "node:fs";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { resolveApiKeyForProvider } from "../agents/model-auth.js";
+import type { OpenClawConfig } from "../config/config.js";
+import {
+  resolveAgentModelFallbackValues,
+  resolveAgentModelPrimaryValue,
+} from "../config/model-input.js";
+import type { MediaUnderstandingModelConfig } from "../config/types.tools.js";
+import {
+  getExtensionHostMediaUnderstandingProvider,
+  normalizeExtensionHostMediaProviderId,
+  type ExtensionHostMediaUnderstandingProviderRegistry,
+} from "../extension-host/media-runtime-registry.js";
+import {
+  AUTO_AUDIO_KEY_PROVIDERS,
+  AUTO_IMAGE_KEY_PROVIDERS,
+  AUTO_VIDEO_KEY_PROVIDERS,
+  DEFAULT_IMAGE_MODELS,
+} from "../media-understanding/defaults.js";
+import { fileExists } from "../media-understanding/fs.js";
+import { extractGeminiResponse } from "../media-understanding/output-extract.js";
+import type { MediaUnderstandingCapability } from "../media-understanding/types.js";
+import { runExec } from "../process/exec.js";
+
+export type ActiveMediaModel = {
+  provider: string; // provider id supplied by the caller; consumers trim it before lookup
+  model?: string; // model id for that provider; may be omitted
+};
+
+type ProviderRegistry = ExtensionHostMediaUnderstandingProviderRegistry;
+
+const binaryCache = new Map<string, Promise<string | null>>();
+const geminiProbeCache = new Map<string, Promise<boolean>>();
+
+/** Empties both module-level caches so tests can re-run binary and Gemini CLI detection. */
+export function clearMediaUnderstandingBinaryCacheForTests(): void {
+  [binaryCache, geminiProbeCache].forEach((cache) => cache.clear());
+}
+
+/**
+ * Expands a leading `~` to the current user's home directory. Only the bare `~` and
+ * `~/...` forms are rewritten; other inputs (for example `~name`) come back unchanged.
+ */
+function expandHomeDir(value: string): string {
+  if (value[0] !== "~") {
+    return value;
+  }
+  const homeDir = os.homedir();
+  const rest = value.slice(1);
+  const underHome = rest.startsWith("/") ? path.join(homeDir, rest.slice(1)) : value;
+  return rest === "" ? homeDir : underHome;
+}
+
+function hasPathSeparator(value: string): boolean {
+  return /[/\\]/.test(value); // true for either "/" or "\"
+}
+
+/**
+ * Returns just `name` off Windows or when it has an extension; else adds PATHEXT variants.
+ */
+function candidateBinaryNames(name: string): string[] {
+  if (process.platform !== "win32" || path.extname(name)) {
+    return [name];
+  }
+  const suffixes = new Set<string>();
+  for (const raw of (process.env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM").split(";")) {
+    const ext = raw.trim();
+    if (ext) {
+      suffixes.add(ext.startsWith(".") ? ext : `.${ext}`);
+    }
+  }
+  return [name, ...Array.from(suffixes, (suffix) => `${name}${suffix}`)];
+}
+
+/**
+ * Reports whether `filePath` is a regular file the process may execute. On win32 the
+ * X_OK access check is skipped. Any error from stat/access is reported as `false`.
+ */
+async function isExecutable(filePath: string): Promise<boolean> {
+  try {
+    const info = await fs.stat(filePath);
+    if (info.isFile() && process.platform !== "win32") {
+      await fs.access(filePath, fsConstants.X_OK);
+    }
+    return info.isFile();
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Finds an executable for `name` (direct path first, then PATH); memoized per raw name.
+ */
+async function findBinary(name: string): Promise<string | null> {
+  const memoized = binaryCache.get(name);
+  if (memoized) {
+    return memoized;
+  }
+  const firstExecutable = async (paths: string[]): Promise<string | null> => {
+    for (const candidatePath of paths) {
+      if (await isExecutable(candidatePath)) {
+        return candidatePath;
+      }
+    }
+    return null;
+  };
+  const pending = (async (): Promise<string | null> => {
+    const trimmed = name.trim();
+    const direct = expandHomeDir(trimmed);
+    if (direct && hasPathSeparator(direct)) {
+      const hit = await firstExecutable(candidateBinaryNames(direct));
+      if (hit) {
+        return hit;
+      }
+    }
+    if (!trimmed) {
+      return null;
+    }
+    const names = candidateBinaryNames(trimmed);
+    // PATH entries may be quoted or start with "~"; blank entries are dropped.
+    const dirs = (process.env.PATH ?? "")
+      .split(path.delimiter)
+      .map((dir) => expandHomeDir(dir.trim().replace(/^"(.*)"$/, "$1")))
+      .filter(Boolean);
+    return firstExecutable(dirs.flatMap((dir) => names.map((file) => path.join(dir, file))));
+  })();
+  binaryCache.set(name, pending);
+  return pending;
+}
+
+async function hasBinary(name: string): Promise<boolean> {
+  return !!(await findBinary(name)); // a null lookup result means no executable was found
+}
+
+/** Memoized check that a `gemini` binary exists and answers a trivial "ok" prompt. */
+async function probeGeminiCli(): Promise<boolean> {
+  const memoized = geminiProbeCache.get("gemini");
+  if (memoized) {
+    return memoized;
+  }
+  const pending = (async (): Promise<boolean> => {
+    try {
+      if (!(await hasBinary("gemini"))) {
+        return false;
+      }
+      const probeArgs = ["--output-format", "json", "ok"];
+      const { stdout } = await runExec("gemini", probeArgs, { timeoutMs: 8000 });
+      return Boolean(extractGeminiResponse(stdout) ?? stdout.toLowerCase().includes("ok"));
+    } catch {
+      return false;
+    }
+  })();
+  geminiProbeCache.set("gemini", pending);
+  return pending;
+}
+
+/** Builds a whisper.cpp CLI entry when `whisper-cli` and a model file are both present. */
+async function resolveLocalWhisperCppEntry(): Promise<MediaUnderstandingModelConfig | null> {
+  if (!(await hasBinary("whisper-cli"))) {
+    return null;
+  }
+  let modelPath = "/opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin";
+  const envModel = process.env.WHISPER_CPP_MODEL?.trim();
+  if (envModel && (await fileExists(envModel))) {
+    modelPath = envModel; // env override wins only when the file exists
+  }
+  if (!(await fileExists(modelPath))) {
+    return null;
+  }
+  const args = ["-m", modelPath, "-otxt", "-of", "{{OutputBase}}", "-np", "-nt", "{{MediaPath}}"];
+  return { type: "cli", command: "whisper-cli", args };
+}
+
+/**
+ * Builds a CLI entry for the `whisper` command when that binary can be found.
+ * The entry asks for the "turbo" model and `txt` output written to `{{OutputDir}}`.
+ * Returns `null` when the binary is missing.
+ */
+async function resolveLocalWhisperEntry(): Promise<MediaUnderstandingModelConfig | null> {
+  const available = await hasBinary("whisper");
+  if (!available) {
+    return null;
+  }
+  // Flag/value pairs first, media path last.
+  const flags: Array<[string, string]> = [
+    ["--model", "turbo"],
+    ["--output_format", "txt"],
+    ["--output_dir", "{{OutputDir}}"],
+    ["--verbose", "False"],
+  ];
+  const args = [...flags.flat(), "{{MediaPath}}"];
+  return { type: "cli", command: "whisper", args };
+}
+
+/**
+ * Builds a CLI entry for `sherpa-onnx-offline`. Requires the binary plus a model
+ * directory named by SHERPA_ONNX_MODEL_DIR that contains `tokens.txt`, `encoder.onnx`,
+ * `decoder.onnx` and `joiner.onnx`; returns `null` as soon as any of these is missing.
+ * Each file is passed as a `--flag=path` argument, followed by `{{MediaPath}}`.
+ */
+async function resolveSherpaOnnxEntry(): Promise<MediaUnderstandingModelConfig | null> {
+  if (!(await hasBinary("sherpa-onnx-offline"))) {
+    return null;
+  }
+  const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim();
+  if (!modelDir) {
+    return null;
+  }
+  // Declaration order is both the existence-check order and the argument order.
+  const modelFiles = [
+    { flag: "--tokens", file: path.join(modelDir, "tokens.txt") },
+    { flag: "--encoder", file: path.join(modelDir, "encoder.onnx") },
+    { flag: "--decoder", file: path.join(modelDir, "decoder.onnx") },
+    { flag: "--joiner", file: path.join(modelDir, "joiner.onnx") },
+  ];
+  for (const { file } of modelFiles) {
+    // Checked one at a time so the first missing file ends the probe.
+    if (!(await fileExists(file))) {
+      return null;
+    }
+  }
+  // Same files, now rendered as "--flag=path" arguments.
+  const args = modelFiles.map(({ flag, file }) => `${flag}=${file}`);
+  args.push("{{MediaPath}}");
+  return {
+    type: "cli",
+    command: "sherpa-onnx-offline",
+    args,
+  };
+}
+
+/** Picks the first available local transcriber: sherpa-onnx, then whisper.cpp, then whisper. */
+async function resolveLocalAudioEntry(): Promise<MediaUnderstandingModelConfig | null> {
+  const probes = [resolveSherpaOnnxEntry, resolveLocalWhisperCppEntry, resolveLocalWhisperEntry];
+  for (const probe of probes) {
+    const entry = await probe();
+    if (entry) {
+      return entry;
+    }
+  }
+  return null;
+}
+
+/**
+ * Builds a Gemini CLI entry once the CLI probe succeeds, or returns `null` otherwise.
+ * The command passes `--allowed-tools read_many_files` and `--include-directories
+ * {{MediaDir}}`, then the prompt and an instruction to read `{{MediaPath}}`.
+ * `_capability` is accepted but not consulted.
+ */
+async function resolveGeminiCliEntry(
+  _capability: MediaUnderstandingCapability,
+): Promise<MediaUnderstandingModelConfig | null> {
+  const usable = await probeGeminiCli();
+  if (!usable) {
+    return null;
+  }
+  // Flags first, then the prompt, then the file-reading instruction.
+  const readInstruction =
+    "Use read_many_files to read {{MediaPath}} and respond with only the text output.";
+  const outputFlags = ["--output-format", "json"];
+  const toolFlags = ["--allowed-tools", "read_many_files"];
+  const dirFlags = ["--include-directories", "{{MediaDir}}"];
+  const args = [...outputFlags, ...toolFlags, ...dirFlags, "{{Prompt}}", readInstruction];
+  return { type: "cli", command: "gemini", args };
+}
+
+/**
+ * Turns the caller's active model into a provider entry, or `null` when it cannot serve
+ * `capability`: no provider given, unknown to the registry, missing the matching handler
+ * (`transcribeAudio` / `describeImage` / `describeVideo`), or API key resolution throws.
+ * The provider id is trimmed and normalized before the registry lookup.
+ */
+async function resolveActiveModelEntry(params: {
+  cfg: OpenClawConfig;
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  capability: MediaUnderstandingCapability;
+  activeModel?: ActiveMediaModel;
+}): Promise<MediaUnderstandingModelConfig | null> {
+  const { cfg, agentDir, providerRegistry, capability, activeModel } = params;
+  const rawProvider = activeModel?.provider?.trim();
+  if (!rawProvider) {
+    return null;
+  }
+  const providerId = normalizeExtensionHostMediaProviderId(rawProvider);
+  if (!providerId) {
+    return null;
+  }
+  const provider = getExtensionHostMediaUnderstandingProvider(providerId, providerRegistry);
+  if (!provider) {
+    return null;
+  }
+  // Each capability is served by exactly one provider handler.
+  const missingHandler =
+    (capability === "audio" && !provider.transcribeAudio) ||
+    (capability === "image" && !provider.describeImage) ||
+    (capability === "video" && !provider.describeVideo);
+  if (missingHandler) {
+    return null;
+  }
+  try {
+    // The resolved key itself is discarded; only whether resolution throws matters here.
+    await resolveApiKeyForProvider({ provider: providerId, cfg, agentDir });
+  } catch {
+    return null;
+  }
+  // The model is passed through untouched and may be undefined.
+  return { type: "provider", provider: providerId, model: activeModel?.model };
+}
+
+/**
+ * Finds a provider that both supports `capability` and has resolvable credentials.
+ *
+ * The caller's active provider is probed first and, when usable, returned together with
+ * the active model. Otherwise the capability's auto provider list is walked in order:
+ * `AUTO_IMAGE_KEY_PROVIDERS` for images (paired with `DEFAULT_IMAGE_MODELS`),
+ * `AUTO_VIDEO_KEY_PROVIDERS` for video, and `AUTO_AUDIO_KEY_PROVIDERS` for everything
+ * else; video and audio entries carry no model. Returns `null` when nothing qualifies.
+ *
+ * Unlike `resolveActiveModelEntry`, the active provider id is only trimmed here and is
+ * not passed through `normalizeExtensionHostMediaProviderId`.
+ */
+async function resolveKeyEntry(params: {
+  cfg: OpenClawConfig;
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  capability: MediaUnderstandingCapability;
+  activeModel?: ActiveMediaModel;
+}): Promise<MediaUnderstandingModelConfig | null> {
+  const { cfg, agentDir, providerRegistry, capability, activeModel } = params;
+
+  // Yields a provider entry when `providerId` is registered, implements the handler for
+  // this capability, and API key resolution does not throw; otherwise yields null.
+  const probeProvider = async (
+    providerId: string,
+    model?: string,
+  ): Promise<MediaUnderstandingModelConfig | null> => {
+    const provider = getExtensionHostMediaUnderstandingProvider(providerId, providerRegistry);
+    if (!provider) {
+      return null;
+    }
+    const missingHandler =
+      (capability === "audio" && !provider.transcribeAudio) ||
+      (capability === "image" && !provider.describeImage) ||
+      (capability === "video" && !provider.describeVideo);
+    if (missingHandler) {
+      return null;
+    }
+    try {
+      // A throw here is treated as "this provider has no usable credentials".
+      await resolveApiKeyForProvider({
+        provider: providerId,
+        cfg,
+        agentDir,
+      });
+    } catch {
+      return null;
+    }
+    return {
+      type: "provider",
+      provider: providerId,
+      model,
+    };
+  };
+
+  // Step 1: the active provider wins whenever it is usable for this capability.
+  const activeProvider = activeModel?.provider?.trim();
+  if (activeProvider) {
+    const activeEntry = await probeProvider(activeProvider, activeModel?.model);
+    if (activeEntry) {
+      return activeEntry;
+    }
+  }
+
+  // Step 2: walk the capability's auto list; the first usable provider is returned.
+  const isImage = capability === "image";
+  // Audio is the fall-through for any capability that is neither image nor video.
+  let autoProviders: readonly string[] = AUTO_AUDIO_KEY_PROVIDERS;
+  if (isImage) {
+    autoProviders = AUTO_IMAGE_KEY_PROVIDERS;
+  } else if (capability === "video") {
+    autoProviders = AUTO_VIDEO_KEY_PROVIDERS;
+  }
+  for (const providerId of autoProviders) {
+    // Only image entries get a default model; other capabilities leave it undefined.
+    const model = isImage ? DEFAULT_IMAGE_MODELS[providerId] : undefined;
+    const entry = await probeProvider(providerId, model);
+    if (entry) {
+      return entry;
+    }
+  }
+  return null;
+}
+
+/**
+ * Reads `agents.defaults.imageModel` (primary first, then fallbacks) and converts each
+ * `provider/model` reference into a provider entry. References are trimmed; blank ones
+ * and ones without a non-empty part on both sides of the first `/` are skipped.
+ * Returns an empty array when nothing usable is configured.
+ */
+function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstandingModelConfig[] {
+  const imageModel = cfg.agents?.defaults?.imageModel;
+  const candidates = [
+    resolveAgentModelPrimaryValue(imageModel),
+    ...resolveAgentModelFallbackValues(imageModel),
+  ];
+  const entries: MediaUnderstandingModelConfig[] = [];
+  for (const candidate of candidates) {
+    const ref = candidate?.trim();
+    if (!ref) {
+      continue;
+    }
+    // Split on the first "/" only, so the model part keeps any later slashes.
+    const slashIdx = ref.indexOf("/");
+    const provider = ref.slice(0, slashIdx);
+    const model = ref.slice(slashIdx + 1);
+    if (slashIdx > 0 && model) {
+      entries.push({ type: "provider", provider, model });
+    }
+  }
+  return entries;
+}
+
+/**
+ * Chooses media-understanding entries automatically for `capability`.
+ * Sources are consulted in this order and the first one that yields anything wins:
+ * the active model, a local audio CLI (audio only), `agents.defaults.imageModel`
+ * (image only), the Gemini CLI, and finally any provider with resolvable credentials.
+ */
+export async function resolveAutoEntries(params: {
+  cfg: OpenClawConfig;
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  capability: MediaUnderstandingCapability;
+  activeModel?: ActiveMediaModel;
+}): Promise<MediaUnderstandingModelConfig[]> {
+  const { capability, cfg } = params;
+  const fromActiveModel = await resolveActiveModelEntry(params);
+  if (fromActiveModel) {
+    return [fromActiveModel];
+  }
+  const fromLocalAudio = capability === "audio" ? await resolveLocalAudioEntry() : null;
+  if (fromLocalAudio) {
+    return [fromLocalAudio];
+  }
+  const fromDefaults = capability === "image" ? resolveImageModelFromAgentDefaults(cfg) : [];
+  if (fromDefaults.length > 0) {
+    return fromDefaults;
+  }
+  const fromGeminiCli = await resolveGeminiCliEntry(capability);
+  if (fromGeminiCli) {
+    return [fromGeminiCli];
+  }
+  const fromKeys = await resolveKeyEntry(params);
+  return fromKeys ? [fromKeys] : [];
+}
+
+/**
+ * Resolves a provider/model pair for image understanding. The active model is preferred
+ * when usable; otherwise the first keyed image provider is used. An entry without a
+ * model falls back to `DEFAULT_IMAGE_MODELS`; `null` means no complete pair was found.
+ */
+export async function resolveAutoImageModel(params: {
+  cfg: OpenClawConfig;
+  agentDir?: string;
+  activeModel?: ActiveMediaModel;
+  providerRegistry: ProviderRegistry;
+}): Promise<ActiveMediaModel | null> {
+  // Both lookups below take the same arguments, pinned to the image capability.
+  const lookup = {
+    cfg: params.cfg,
+    agentDir: params.agentDir,
+    providerRegistry: params.providerRegistry,
+    capability: "image" as const,
+    activeModel: params.activeModel,
+  };
+
+  // Converts a resolved entry into a provider/model pair; CLI entries never qualify.
+  function toActive(entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null {
+    if (!entry || entry.type === "cli" || !entry.provider) {
+      return null;
+    }
+    const model = entry.model ?? DEFAULT_IMAGE_MODELS[entry.provider];
+    if (!model) {
+      return null;
+    }
+    return { provider: entry.provider, model };
+  }
+
+  // First choice: the caller's active model, when it can handle images.
+  const fromActive = toActive(await resolveActiveModelEntry(lookup));
+  if (fromActive) {
+    return fromActive;
+  }
+  // Second choice: the first provider that resolveKeyEntry finds usable.
+  return toActive(await resolveKeyEntry(lookup));
+}
diff --git a/src/extension-host/media-runtime-orchestration.test.ts b/src/extension-host/media-runtime-orchestration.test.ts
new file mode 100644
index 00000000000..bb02eeac30c
--- /dev/null
+++ b/src/extension-host/media-runtime-orchestration.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it, vi } from "vitest";
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { OpenClawConfig } from "../config/config.js";
+import {
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+} from "../media-understanding/runner.js";
+import { runCapability } from "./media-runtime-orchestration.js";
+import { buildExtensionHostMediaUnderstandingRegistry } from "./media-runtime-registry.js";
+
+// Stub catalog with a single openai model whose inputs include "image".
+const catalog = [
+  {
+    id: "gpt-4.1",
+    name: "GPT-4.1",
+    provider: "openai",
+    input: ["text", "image"] as const,
+  },
+];
+
+// Keep the real model-catalog module but make loadModelCatalog resolve to the stub.
+// `catalog` is read only when the mocked loader is called, not when the factory runs.
+vi.mock("../agents/model-catalog.js", async () => {
+  type ModelCatalogModule = typeof import("../agents/model-catalog.js");
+  const realModule = await vi.importActual<ModelCatalogModule>("../agents/model-catalog.js");
+  const loadModelCatalog = vi.fn(async () => catalog);
+  return { ...realModule, loadModelCatalog };
+});
+
+describe("media runtime orchestration", () => {
+  it("skips image understanding when the active model already supports vision", async () => {
+    // One PNG attachment; the mocked catalog lists gpt-4.1 with "image" input.
+    const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" };
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+    const activeModel = { provider: "openai", model: "gpt-4.1" };
+
+    try {
+      const { outputs, decision } = await runCapability({
+        capability: "image",
+        cfg: {} as OpenClawConfig,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry: buildExtensionHostMediaUnderstandingRegistry(),
+        activeModel,
+      });
+
+      expect(outputs).toHaveLength(0);
+      expect(decision.outcome).toBe("skipped");
+      expect(decision.attachments).toHaveLength(1);
+      const firstAttempt = decision.attachments[0]?.attempts[0];
+      expect(firstAttempt?.reason).toBe("primary model supports vision natively");
+    } finally {
+      await cache.cleanup();
+    }
+  });
+});
diff --git a/src/extension-host/media-runtime-orchestration.ts b/src/extension-host/media-runtime-orchestration.ts
new file mode 100644
index 00000000000..79161068de7
--- /dev/null
+++ b/src/extension-host/media-runtime-orchestration.ts
@@ -0,0 +1,272 @@
+import {
+  findModelInCatalog,
+  loadModelCatalog,
+  modelSupportsVision,
+} from "../agents/model-catalog.js";
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { OpenClawConfig } from "../config/config.js";
+import type {
+  MediaUnderstandingConfig,
+  MediaUnderstandingModelConfig,
+} from "../config/types.tools.js";
+import { logVerbose, shouldLogVerbose } from "../globals.js";
+import { MediaAttachmentCache, selectAttachments } from "../media-understanding/attachments.js";
+import { isMediaUnderstandingSkipError } from "../media-understanding/errors.js";
+import { resolveModelEntries, resolveScopeDecision } from "../media-understanding/resolve.js";
+import {
+  buildModelDecision,
+  formatDecisionSummary,
+  runCliEntry,
+  runProviderEntry,
+} from "../media-understanding/runner.entries.js";
+import type {
+  MediaAttachment,
+  MediaUnderstandingCapability,
+  MediaUnderstandingDecision,
+  MediaUnderstandingModelDecision,
+  MediaUnderstandingOutput,
+  MediaUnderstandingProvider,
+} from "../media-understanding/types.js";
+import { resolveAutoEntries, type ActiveMediaModel } from "./media-runtime-auto.js";
+
+type ProviderRegistry = Map<string, MediaUnderstandingProvider>;
+
+export type RunCapabilityResult = {
+  outputs: MediaUnderstandingOutput[]; // one item per attachment that produced output
+  decision: MediaUnderstandingDecision; // overall outcome plus per-attachment attempts
+};
+
+/**
+ * Runs `entries` in order against one attachment until one of them produces output.
+ *
+ * Every entry leaves a record in `attempts`: "success" for the entry that produced the
+ * output (which ends the loop), "skipped" for an empty result or a skip error, and
+ * "failed" for any other thrown error. Errors never propagate; the next entry is tried.
+ *
+ * @returns the first non-empty output (or `null`) together with all recorded attempts.
+ */
+async function runAttachmentEntries(params: {
+  capability: MediaUnderstandingCapability;
+  cfg: OpenClawConfig;
+  ctx: MsgContext;
+  attachmentIndex: number;
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  cache: MediaAttachmentCache;
+  entries: MediaUnderstandingModelConfig[];
+  config?: MediaUnderstandingConfig;
+}): Promise<{
+  output: MediaUnderstandingOutput | null;
+  attempts: MediaUnderstandingModelDecision[];
+}> {
+  const { capability, cfg, ctx, attachmentIndex, cache, config } = params;
+  const attempts: MediaUnderstandingModelDecision[] = [];
+
+  for (const entry of params.entries) {
+    // An entry without an explicit type counts as CLI when it carries a command.
+    const entryType = entry.type ?? (entry.command ? "cli" : "provider");
+    // Arguments common to both runners; the provider runner needs two more.
+    const shared = {
+      capability,
+      entry,
+      cfg,
+      ctx,
+      attachmentIndex,
+      cache,
+      config,
+    };
+    try {
+      const result =
+        entryType === "cli"
+          ? await runCliEntry(shared)
+          : await runProviderEntry({
+              ...shared,
+              agentDir: params.agentDir,
+              providerRegistry: params.providerRegistry,
+            });
+      if (!result) {
+        attempts.push(
+          buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }),
+        );
+        continue;
+      }
+      const decision = buildModelDecision({ entry, entryType, outcome: "success" });
+      // Values reported by the runner override what buildModelDecision derived.
+      if (result.provider) {
+        decision.provider = result.provider;
+      }
+      if (result.model) {
+        decision.model = result.model;
+      }
+      attempts.push(decision);
+      return { output: result, attempts };
+    } catch (err) {
+      if (isMediaUnderstandingSkipError(err)) {
+        // Skip errors are recorded as "skipped" rather than "failed".
+        const reason = `${err.reason}: ${err.message}`;
+        attempts.push(buildModelDecision({ entry, entryType, outcome: "skipped", reason }));
+        if (shouldLogVerbose()) {
+          logVerbose(`Skipping ${capability} model due to ${reason}`);
+        }
+      } else {
+        // Anything else is recorded as a failure before the next entry is tried.
+        const reason = String(err);
+        attempts.push(buildModelDecision({ entry, entryType, outcome: "failed", reason }));
+        if (shouldLogVerbose()) {
+          logVerbose(`${capability} understanding failed: ${reason}`);
+        }
+      }
+    }
+  }
+
+  // Every entry was tried without producing output.
+  return { output: null, attempts };
+}
+
+/**
+ * Runs one media-understanding capability over the attachments of a message.
+ *
+ * The checks below run in order and the first one that applies ends the run:
+ * 1. `enabled === false` in the capability config -> outcome "disabled".
+ * 2. `selectAttachments` selects nothing -> outcome "no-attachment".
+ * 3. `resolveScopeDecision` returns "deny" -> outcome "scope-deny".
+ * 4. Image capability and the catalog says the active model supports vision ->
+ *    outcome "skipped", with one skipped attempt recorded per selected attachment.
+ * 5. No configured entries and `resolveAutoEntries` finds none -> outcome "skipped".
+ * 6. Otherwise every selected attachment is run through the entries, one attachment
+ *    at a time; the outcome is "success" if any attachment produced output, else
+ *    "skipped".
+ *
+ * `params.config` takes precedence over `cfg.tools.media[capability]` when provided.
+ */
+export async function runCapability(params: {
+  capability: MediaUnderstandingCapability;
+  cfg: OpenClawConfig;
+  ctx: MsgContext;
+  attachments: MediaAttachmentCache;
+  media: MediaAttachment[];
+  agentDir?: string;
+  providerRegistry: ProviderRegistry;
+  config?: MediaUnderstandingConfig;
+  activeModel?: ActiveMediaModel;
+}): Promise<RunCapabilityResult> {
+  const { capability, cfg, ctx, agentDir, providerRegistry, activeModel } = params;
+  const config = params.config ?? cfg.tools?.media?.[capability];
+  if (config?.enabled === false) {
+    return {
+      outputs: [],
+      decision: { capability, outcome: "disabled", attachments: [] },
+    };
+  }
+
+  const selected = selectAttachments({
+    capability,
+    attachments: params.media,
+    policy: config?.attachments,
+  });
+  if (selected.length === 0) {
+    return {
+      outputs: [],
+      decision: { capability, outcome: "no-attachment", attachments: [] },
+    };
+  }
+
+  // Decision rows for the outcomes below where no model was tried at all.
+  const untriedAttachments = (): MediaUnderstandingDecision["attachments"] =>
+    selected.map((item) => ({ attachmentIndex: item.index, attempts: [] }));
+
+  if (resolveScopeDecision({ scope: config?.scope, ctx }) === "deny") {
+    if (shouldLogVerbose()) {
+      logVerbose(`${capability} understanding disabled by scope policy.`);
+    }
+    return {
+      outputs: [],
+      decision: { capability, outcome: "scope-deny", attachments: untriedAttachments() },
+    };
+  }
+
+  // Skip image understanding when the primary model supports vision natively.
+  // The image will be injected directly into the model context instead.
+  const activeProvider = activeModel?.provider?.trim();
+  if (capability === "image" && activeProvider) {
+    const catalog = await loadModelCatalog({ config: cfg });
+    // Note: the catalog lookup uses the model id as given (untrimmed, "" when absent).
+    const catalogEntry = findModelInCatalog(catalog, activeProvider, activeModel?.model ?? "");
+    if (modelSupportsVision(catalogEntry)) {
+      if (shouldLogVerbose()) {
+        logVerbose("Skipping image understanding: primary model supports vision natively");
+      }
+      const skippedModel = activeModel?.model?.trim() || undefined;
+      const attachments = selected.map((item) => {
+        // A fresh attempt object per attachment, referenced as both attempt and choice.
+        const attempt = {
+          type: "provider" as const,
+          provider: activeProvider,
+          model: skippedModel,
+          outcome: "skipped" as const,
+          reason: "primary model supports vision natively",
+        };
+        return {
+          attachmentIndex: item.index,
+          attempts: [attempt],
+          chosen: attempt,
+        };
+      });
+      return {
+        outputs: [],
+        decision: { capability, outcome: "skipped", attachments },
+      };
+    }
+  }
+
+  // Explicitly configured entries win; auto-detection runs only when there are none.
+  let entries = resolveModelEntries({ cfg, capability, config, providerRegistry });
+  if (entries.length === 0) {
+    entries = await resolveAutoEntries({
+      cfg,
+      agentDir,
+      providerRegistry,
+      capability,
+      activeModel,
+    });
+  }
+  if (entries.length === 0) {
+    return {
+      outputs: [],
+      decision: { capability, outcome: "skipped", attachments: untriedAttachments() },
+    };
+  }
+
+  // Attachments are processed one after another, not in parallel.
+  const outputs: MediaUnderstandingOutput[] = [];
+  const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
+  for (const { index: attachmentIndex } of selected) {
+    const { output, attempts } = await runAttachmentEntries({
+      capability,
+      cfg,
+      ctx,
+      attachmentIndex,
+      agentDir,
+      providerRegistry,
+      cache: params.attachments,
+      entries,
+      config,
+    });
+    if (output) {
+      outputs.push(output);
+    }
+    // `chosen` stays undefined when no entry succeeded for this attachment.
+    const chosen = attempts.find((attempt) => attempt.outcome === "success");
+    attachmentDecisions.push({ attachmentIndex, attempts, chosen });
+  }
+
+  const decision: MediaUnderstandingDecision = {
+    capability,
+    outcome: outputs.length > 0 ? "success" : "skipped",
+    attachments: attachmentDecisions,
+  };
+  if (shouldLogVerbose()) {
+    logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
+  }
+  return { outputs, decision };
+}
diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts
index f4b9e09d13c..4cf2a923e9c 100644
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -1,77 +1,38 @@
-import { constants as fsConstants } from "node:fs";
-import fs from "node:fs/promises";
-import os from "node:os";
-import path from "node:path";
-import { resolveApiKeyForProvider } from "../agents/model-auth.js";
-import {
-  findModelInCatalog,
-  loadModelCatalog,
-  modelSupportsVision,
-} from "../agents/model-catalog.js";
 import type { MsgContext } from "../auto-reply/templating.js";
 import type { OpenClawConfig } from "../config/config.js";
+import type { MediaUnderstandingConfig } from "../config/types.tools.js";
 import {
-  resolveAgentModelFallbackValues,
-  resolveAgentModelPrimaryValue,
-} from "../config/model-input.js";
-import type {
-  MediaUnderstandingConfig,
-  MediaUnderstandingModelConfig,
-} from "../config/types.tools.js";
+  clearMediaUnderstandingBinaryCacheForTests as clearExtensionHostMediaUnderstandingBinaryCacheForTests,
+  resolveAutoImageModel as resolveExtensionHostAutoImageModel,
+  type ActiveMediaModel,
+} from "../extension-host/media-runtime-auto.js";
+import {
+  runCapability as runExtensionHostMediaCapability,
+  type RunCapabilityResult,
+} from "../extension-host/media-runtime-orchestration.js";
 import {
   buildExtensionHostMediaUnderstandingRegistry,
-  getExtensionHostMediaUnderstandingProvider,
-  normalizeExtensionHostMediaProviderId,
+  type ExtensionHostMediaUnderstandingProviderRegistry,
 } from "../extension-host/media-runtime-registry.js";
-import { logVerbose, shouldLogVerbose } from "../globals.js";
 import {
   mergeInboundPathRoots,
   resolveIMessageAttachmentRoots,
 } from "../media/inbound-path-policy.js";
 import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
-import { runExec } from "../process/exec.js";
 import {
   MediaAttachmentCache,
   type MediaAttachmentCacheOptions,
   normalizeAttachments,
-  selectAttachments,
 } from "./attachments.js";
-import {
-  AUTO_AUDIO_KEY_PROVIDERS,
-  AUTO_IMAGE_KEY_PROVIDERS,
-  AUTO_VIDEO_KEY_PROVIDERS,
-  DEFAULT_IMAGE_MODELS,
-} from "./defaults.js";
-import { isMediaUnderstandingSkipError } from "./errors.js";
-import { fileExists } from "./fs.js";
-import { extractGeminiResponse } from "./output-extract.js";
-import { resolveModelEntries, resolveScopeDecision } from "./resolve.js";
-import {
-  buildModelDecision,
-  formatDecisionSummary,
-  runCliEntry,
-  runProviderEntry,
-} from "./runner.entries.js";
 import type {
   MediaAttachment,
   MediaUnderstandingCapability,
-  MediaUnderstandingDecision,
-  MediaUnderstandingModelDecision,
-  MediaUnderstandingOutput,
   MediaUnderstandingProvider,
 } from "./types.js";
 
-export type ActiveMediaModel = {
-  provider: string;
-  model?: string;
-};
+type ProviderRegistry = ExtensionHostMediaUnderstandingProviderRegistry;
 
-type ProviderRegistry = Map<string, MediaUnderstandingProvider>;
-
-export type RunCapabilityResult = {
-  outputs: MediaUnderstandingOutput[];
-  decision: MediaUnderstandingDecision;
-};
+export type { ActiveMediaModel, RunCapabilityResult };
 
 export function buildProviderRegistry(
   overrides?: Record<string, MediaUnderstandingProvider>,
@@ -103,385 +64,8 @@ export function createMediaAttachmentCache(
   return new MediaAttachmentCache(attachments, options);
 }
 
-const binaryCache = new Map<string, Promise<string | null>>();
-const geminiProbeCache = new Map<string, Promise<boolean>>();
-
 export function clearMediaUnderstandingBinaryCacheForTests(): void {
-  binaryCache.clear();
-  geminiProbeCache.clear();
-}
-
-function expandHomeDir(value: string): string {
-  if (!value.startsWith("~")) {
-    return value;
-  }
-  const home = os.homedir();
-  if (value === "~") {
-    return home;
-  }
-  if (value.startsWith("~/")) {
-    return path.join(home, value.slice(2));
-  }
-  return value;
-}
-
-function hasPathSeparator(value: string): boolean {
-  return value.includes("/") || value.includes("\\");
-}
-
-function candidateBinaryNames(name: string): string[] {
-  if (process.platform !== "win32") {
-    return [name];
-  }
-  const ext = path.extname(name);
-  if (ext) {
-    return [name];
-  }
-  const pathext = (process.env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM")
-    .split(";")
-    .map((item) => item.trim())
-    .filter(Boolean)
-    .map((item) => (item.startsWith(".") ? item : `.${item}`));
-  const unique = Array.from(new Set(pathext));
-  return [name, ...unique.map((item) => `${name}${item}`)];
-}
-
-async function isExecutable(filePath: string): Promise<boolean> {
-  try {
-    const stat = await fs.stat(filePath);
-    if (!stat.isFile()) {
-      return false;
-    }
-    if (process.platform === "win32") {
-      return true;
-    }
-    await fs.access(filePath, fsConstants.X_OK);
-    return true;
-  } catch {
-    return false;
-  }
-}
-
-async function findBinary(name: string): Promise<string | null> {
-  const cached = binaryCache.get(name);
-  if (cached) {
-    return cached;
-  }
-  const resolved = (async () => {
-    const direct = expandHomeDir(name.trim());
-    if (direct && hasPathSeparator(direct)) {
-      for (const candidate of candidateBinaryNames(direct)) {
-        if (await isExecutable(candidate)) {
-          return candidate;
-        }
-      }
-    }
-
-    const searchName = name.trim();
-    if (!searchName) {
-      return null;
-    }
-    const pathEntries = (process.env.PATH ?? "").split(path.delimiter);
-    const candidates = candidateBinaryNames(searchName);
-    for (const entryRaw of pathEntries) {
-      const entry = expandHomeDir(entryRaw.trim().replace(/^"(.*)"$/, "$1"));
-      if (!entry) {
-        continue;
-      }
-      for (const candidate of candidates) {
-        const fullPath = path.join(entry, candidate);
-        if (await isExecutable(fullPath)) {
-          return fullPath;
-        }
-      }
-    }
-
-    return null;
-  })();
-  binaryCache.set(name, resolved);
-  return resolved;
-}
-
-async function hasBinary(name: string): Promise<boolean> {
-  return Boolean(await findBinary(name));
-}
-
-async function probeGeminiCli(): Promise<boolean> {
-  const cached = geminiProbeCache.get("gemini");
-  if (cached) {
-    return cached;
-  }
-  const resolved = (async () => {
-    if (!(await hasBinary("gemini"))) {
-      return false;
-    }
-    try {
-      const { stdout } = await runExec("gemini", ["--output-format", "json", "ok"], {
-        timeoutMs: 8000,
-      });
-      return Boolean(extractGeminiResponse(stdout) ?? stdout.toLowerCase().includes("ok"));
-    } catch {
-      return false;
-    }
-  })();
-  geminiProbeCache.set("gemini", resolved);
-  return resolved;
-}
-
-async function resolveLocalWhisperCppEntry(): Promise<MediaUnderstandingModelConfig | null> {
-  if (!(await hasBinary("whisper-cli"))) {
-    return null;
-  }
-  const envModel = process.env.WHISPER_CPP_MODEL?.trim();
-  const defaultModel = "/opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin";
-  const modelPath = envModel && (await fileExists(envModel)) ? envModel : defaultModel;
-  if (!(await fileExists(modelPath))) {
-    return null;
-  }
-  return {
-    type: "cli",
-    command: "whisper-cli",
-    args: ["-m", modelPath, "-otxt", "-of", "{{OutputBase}}", "-np", "-nt", "{{MediaPath}}"],
-  };
-}
-
-async function resolveLocalWhisperEntry(): Promise<MediaUnderstandingModelConfig | null> {
-  if (!(await hasBinary("whisper"))) {
-    return null;
-  }
-  return {
-    type: "cli",
-    command: "whisper",
-    args: [
-      "--model",
-      "turbo",
-      "--output_format",
-      "txt",
-      "--output_dir",
-      "{{OutputDir}}",
-      "--verbose",
-      "False",
-      "{{MediaPath}}",
-    ],
-  };
-}
-
-async function resolveSherpaOnnxEntry(): Promise<MediaUnderstandingModelConfig | null> {
-  if (!(await hasBinary("sherpa-onnx-offline"))) {
-    return null;
-  }
-  const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim();
-  if (!modelDir) {
-    return null;
-  }
-  const tokens = path.join(modelDir, "tokens.txt");
-  const encoder = path.join(modelDir, "encoder.onnx");
-  const decoder = path.join(modelDir, "decoder.onnx");
-  const joiner = path.join(modelDir, "joiner.onnx");
-  if (!(await fileExists(tokens))) {
-    return null;
-  }
-  if (!(await fileExists(encoder))) {
-    return null;
-  }
-  if (!(await fileExists(decoder))) {
-    return null;
-  }
-  if (!(await fileExists(joiner))) {
-    return null;
-  }
-  return {
-    type: "cli",
-    command: "sherpa-onnx-offline",
-    args: [
-      `--tokens=${tokens}`,
-      `--encoder=${encoder}`,
-      `--decoder=${decoder}`,
-      `--joiner=${joiner}`,
-      "{{MediaPath}}",
-    ],
-  };
-}
-
-async function resolveLocalAudioEntry(): Promise<MediaUnderstandingModelConfig | null> {
-  const sherpa = await resolveSherpaOnnxEntry();
-  if (sherpa) {
-    return sherpa;
-  }
-  const whisperCpp = await resolveLocalWhisperCppEntry();
-  if (whisperCpp) {
-    return whisperCpp;
-  }
-  return await resolveLocalWhisperEntry();
-}
-
-async function resolveGeminiCliEntry(
-  _capability: MediaUnderstandingCapability,
-): Promise<MediaUnderstandingModelConfig | null> {
-  if (!(await probeGeminiCli())) {
-    return null;
-  }
-  return {
-    type: "cli",
-    command: "gemini",
-    args: [
-      "--output-format",
-      "json",
-      "--allowed-tools",
-      "read_many_files",
-      "--include-directories",
-      "{{MediaDir}}",
-      "{{Prompt}}",
-      "Use read_many_files to read {{MediaPath}} and respond with only the text output.",
-    ],
-  };
-}
-
-async function resolveKeyEntry(params: {
-  cfg: OpenClawConfig;
-  agentDir?: string;
-  providerRegistry: ProviderRegistry;
-  capability: MediaUnderstandingCapability;
-  activeModel?: ActiveMediaModel;
-}): Promise<MediaUnderstandingModelConfig | null> {
-  const { cfg, agentDir, providerRegistry, capability } = params;
-  const checkProvider = async (
-    providerId: string,
-    model?: string,
-  ): Promise<MediaUnderstandingModelConfig | null> => {
-    const provider = getExtensionHostMediaUnderstandingProvider(providerId, providerRegistry);
-    if (!provider) {
-      return null;
-    }
-    if (capability === "audio" && !provider.transcribeAudio) {
-      return null;
-    }
-    if (capability === "image" && !provider.describeImage) {
-      return null;
-    }
-    if (capability === "video" && !provider.describeVideo) {
-      return null;
-    }
-    try {
-      await resolveApiKeyForProvider({ provider: providerId, cfg, agentDir });
-      return { type: "provider" as const, provider: providerId, model };
-    } catch {
-      return null;
-    }
-  };
-
-  if (capability === "image") {
-    const activeProvider = params.activeModel?.provider?.trim();
-    if (activeProvider) {
-      const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
-      if (activeEntry) {
-        return activeEntry;
-      }
-    }
-    for (const providerId of AUTO_IMAGE_KEY_PROVIDERS) {
-      const model = DEFAULT_IMAGE_MODELS[providerId];
-      const entry = await checkProvider(providerId, model);
-      if (entry) {
-        return entry;
-      }
-    }
-    return null;
-  }
-
-  if (capability === "video") {
-    const activeProvider = params.activeModel?.provider?.trim();
-    if (activeProvider) {
-      const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
-      if (activeEntry) {
-        return activeEntry;
-      }
-    }
-    for (const providerId of AUTO_VIDEO_KEY_PROVIDERS) {
-      const entry = await checkProvider(providerId, undefined);
-      if (entry) {
-        return entry;
-      }
-    }
-    return null;
-  }
-
-  const activeProvider = params.activeModel?.provider?.trim();
-  if (activeProvider) {
-    const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
-    if (activeEntry) {
-      return activeEntry;
-    }
-  }
-  for (const providerId of AUTO_AUDIO_KEY_PROVIDERS) {
-    const entry = await checkProvider(providerId, undefined);
-    if (entry) {
-      return entry;
-    }
-  }
-  return null;
-}
-
-function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstandingModelConfig[] {
-  const refs: string[] = [];
-  const primary = resolveAgentModelPrimaryValue(cfg.agents?.defaults?.imageModel);
-  if (primary?.trim()) {
-    refs.push(primary.trim());
-  }
-  for (const fb of resolveAgentModelFallbackValues(cfg.agents?.defaults?.imageModel)) {
-    if (fb?.trim()) {
-      refs.push(fb.trim());
-    }
-  }
-  if (refs.length === 0) {
-    return [];
-  }
-  const entries: MediaUnderstandingModelConfig[] = [];
-  for (const ref of refs) {
-    const slashIdx = ref.indexOf("/");
-    if (slashIdx <= 0 || slashIdx >= ref.length - 1) {
-      continue;
-    }
-    entries.push({
-      type: "provider",
-      provider: ref.slice(0, slashIdx),
-      model: ref.slice(slashIdx + 1),
-    });
-  }
-  return entries;
-}
-
-async function resolveAutoEntries(params: {
-  cfg: OpenClawConfig;
-  agentDir?: string;
-  providerRegistry: ProviderRegistry;
-  capability: MediaUnderstandingCapability;
-  activeModel?: ActiveMediaModel;
-}): Promise<MediaUnderstandingModelConfig[]> {
-  const activeEntry = await resolveActiveModelEntry(params);
-  if (activeEntry) {
-    return [activeEntry];
-  }
-  if (params.capability === "audio") {
-    const localAudio = await resolveLocalAudioEntry();
-    if (localAudio) {
-      return [localAudio];
-    }
-  }
-  if (params.capability === "image") {
-    const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg);
-    if (imageModelEntries.length > 0) {
-      return imageModelEntries;
-    }
-  }
-  const gemini = await resolveGeminiCliEntry(params.capability);
-  if (gemini) {
-    return [gemini];
-  }
-  const keys = await resolveKeyEntry(params);
-  if (keys) {
-    return [keys];
-  }
-  return [];
+  clearExtensionHostMediaUnderstandingBinaryCacheForTests(); // delegate: binaryCache / geminiProbeCache are no longer kept in this module
 }
 
 export async function resolveAutoImageModel(params: {
@@ -489,171 +73,10 @@ export async function resolveAutoImageModel(params: {
   agentDir?: string;
   activeModel?: ActiveMediaModel;
 }): Promise<ActiveMediaModel | null> {
-  const providerRegistry = buildProviderRegistry();
-  const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => {
-    if (!entry || entry.type === "cli") {
-      return null;
-    }
-    const provider = entry.provider;
-    if (!provider) {
-      return null;
-    }
-    const model = entry.model ?? DEFAULT_IMAGE_MODELS[provider];
-    if (!model) {
-      return null;
-    }
-    return { provider, model };
-  };
-  const activeEntry = await resolveActiveModelEntry({
-    cfg: params.cfg,
-    agentDir: params.agentDir,
-    providerRegistry,
-    capability: "image",
-    activeModel: params.activeModel,
+  return await resolveExtensionHostAutoImageModel({
+    ...params, // forward the caller's params unchanged
+    providerRegistry: buildProviderRegistry(), // registry built with no overrides
   });
-  const resolvedActive = toActive(activeEntry);
-  if (resolvedActive) {
-    return resolvedActive;
-  }
-  const keyEntry = await resolveKeyEntry({
-    cfg: params.cfg,
-    agentDir: params.agentDir,
-    providerRegistry,
-    capability: "image",
-    activeModel: params.activeModel,
-  });
-  return toActive(keyEntry);
-}
-
-async function resolveActiveModelEntry(params: {
-  cfg: OpenClawConfig;
-  agentDir?: string;
-  providerRegistry: ProviderRegistry;
-  capability: MediaUnderstandingCapability;
-  activeModel?: ActiveMediaModel;
-}): Promise<MediaUnderstandingModelConfig | null> {
-  const activeProviderRaw = params.activeModel?.provider?.trim();
-  if (!activeProviderRaw) {
-    return null;
-  }
-  const providerId = normalizeExtensionHostMediaProviderId(activeProviderRaw);
-  if (!providerId) {
-    return null;
-  }
-  const provider = getExtensionHostMediaUnderstandingProvider(providerId, params.providerRegistry);
-  if (!provider) {
-    return null;
-  }
-  if (params.capability === "audio" && !provider.transcribeAudio) {
-    return null;
-  }
-  if (params.capability === "image" && !provider.describeImage) {
-    return null;
-  }
-  if (params.capability === "video" && !provider.describeVideo) {
-    return null;
-  }
-  try {
-    await resolveApiKeyForProvider({
-      provider: providerId,
-      cfg: params.cfg,
-      agentDir: params.agentDir,
-    });
-  } catch {
-    return null;
-  }
-  return {
-    type: "provider",
-    provider: providerId,
-    model: params.activeModel?.model,
-  };
-}
-
-async function runAttachmentEntries(params: {
-  capability: MediaUnderstandingCapability;
-  cfg: OpenClawConfig;
-  ctx: MsgContext;
-  attachmentIndex: number;
-  agentDir?: string;
-  providerRegistry: ProviderRegistry;
-  cache: MediaAttachmentCache;
-  entries: MediaUnderstandingModelConfig[];
-  config?: MediaUnderstandingConfig;
-}): Promise<{
-  output: MediaUnderstandingOutput | null;
-  attempts: MediaUnderstandingModelDecision[];
-}> {
-  const { entries, capability } = params;
-  const attempts: MediaUnderstandingModelDecision[] = [];
-  for (const entry of entries) {
-    const entryType = entry.type ?? (entry.command ? "cli" : "provider");
-    try {
-      const result =
-        entryType === "cli"
-          ? await runCliEntry({
-              capability,
-              entry,
-              cfg: params.cfg,
-              ctx: params.ctx,
-              attachmentIndex: params.attachmentIndex,
-              cache: params.cache,
-              config: params.config,
-            })
-          : await runProviderEntry({
-              capability,
-              entry,
-              cfg: params.cfg,
-              ctx: params.ctx,
-              attachmentIndex: params.attachmentIndex,
-              cache: params.cache,
-              agentDir: params.agentDir,
-              providerRegistry: params.providerRegistry,
-              config: params.config,
-            });
-      if (result) {
-        const decision = buildModelDecision({ entry, entryType, outcome: "success" });
-        if (result.provider) {
-          decision.provider = result.provider;
-        }
-        if (result.model) {
-          decision.model = result.model;
-        }
-        attempts.push(decision);
-        return { output: result, attempts };
-      }
-      attempts.push(
-        buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }),
-      );
-    } catch (err) {
-      if (isMediaUnderstandingSkipError(err)) {
-        attempts.push(
-          buildModelDecision({
-            entry,
-            entryType,
-            outcome: "skipped",
-            reason: `${err.reason}: ${err.message}`,
-          }),
-        );
-        if (shouldLogVerbose()) {
-          logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
-        }
-        continue;
-      }
-      attempts.push(
-        buildModelDecision({
-          entry,
-          entryType,
-          outcome: "failed",
-          reason: String(err),
-        }),
-      );
-      if (shouldLogVerbose()) {
-        logVerbose(`${capability} understanding failed: ${String(err)}`);
-      }
-    }
-  }
-
-  return { output: null, attempts };
 }
 
 export async function runCapability(params: {
@@ -667,139 +90,5 @@ export async function runCapability(params: {
   config?: MediaUnderstandingConfig;
   activeModel?: ActiveMediaModel;
 }): Promise<RunCapabilityResult> {
-  const { capability, cfg, ctx } = params;
-  const config = params.config ?? cfg.tools?.media?.[capability];
-  if (config?.enabled === false) {
-    return {
-      outputs: [],
-      decision: { capability, outcome: "disabled", attachments: [] },
-    };
-  }
-
-  const attachmentPolicy = config?.attachments;
-  const selected = selectAttachments({
-    capability,
-    attachments: params.media,
-    policy: attachmentPolicy,
-  });
-  if (selected.length === 0) {
-    return {
-      outputs: [],
-      decision: { capability, outcome: "no-attachment", attachments: [] },
-    };
-  }
-
-  const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
-  if (scopeDecision === "deny") {
-    if (shouldLogVerbose()) {
-      logVerbose(`${capability} understanding disabled by scope policy.`);
-    }
-    return {
-      outputs: [],
-      decision: {
-        capability,
-        outcome: "scope-deny",
-        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
-      },
-    };
-  }
-
-  // Skip image understanding when the primary model supports vision natively.
-  // The image will be injected directly into the model context instead.
-  const activeProvider = params.activeModel?.provider?.trim();
-  if (capability === "image" && activeProvider) {
-    const catalog = await loadModelCatalog({ config: cfg });
-    const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
-    if (modelSupportsVision(entry)) {
-      if (shouldLogVerbose()) {
-        logVerbose("Skipping image understanding: primary model supports vision natively");
-      }
-      const model = params.activeModel?.model?.trim();
-      const reason = "primary model supports vision natively";
-      return {
-        outputs: [],
-        decision: {
-          capability,
-          outcome: "skipped",
-          attachments: selected.map((item) => {
-            const attempt = {
-              type: "provider" as const,
-              provider: activeProvider,
-              model: model || undefined,
-              outcome: "skipped" as const,
-              reason,
-            };
-            return {
-              attachmentIndex: item.index,
-              attempts: [attempt],
-              chosen: attempt,
-            };
-          }),
-        },
-      };
-    }
-  }
-
-  const entries = resolveModelEntries({
-    cfg,
-    capability,
-    config,
-    providerRegistry: params.providerRegistry,
-  });
-  let resolvedEntries = entries;
-  if (resolvedEntries.length === 0) {
-    resolvedEntries = await resolveAutoEntries({
-      cfg,
-      agentDir: params.agentDir,
-      providerRegistry: params.providerRegistry,
-      capability,
-      activeModel: params.activeModel,
-    });
-  }
-  if (resolvedEntries.length === 0) {
-    return {
-      outputs: [],
-      decision: {
-        capability,
-        outcome: "skipped",
-        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
-      },
-    };
-  }
-
-  const outputs: MediaUnderstandingOutput[] = [];
-  const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
-  for (const attachment of selected) {
-    const { output, attempts } = await runAttachmentEntries({
-      capability,
-      cfg,
-      ctx,
-      attachmentIndex: attachment.index,
-      agentDir: params.agentDir,
-      providerRegistry: params.providerRegistry,
-      cache: params.attachments,
-      entries: resolvedEntries,
-      config,
-    });
-    if (output) {
-      outputs.push(output);
-    }
-    attachmentDecisions.push({
-      attachmentIndex: attachment.index,
-      attempts,
-      chosen: attempts.find((attempt) => attempt.outcome === "success"),
-    });
-  }
-  const decision: MediaUnderstandingDecision = {
-    capability,
-    outcome: outputs.length > 0 ? "success" : "skipped",
-    attachments: attachmentDecisions,
-  };
-  if (shouldLogVerbose()) {
-    logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
-  }
-  return {
-    outputs,
-    decision,
-  };
+  return await runExtensionHostMediaCapability(params); // thin wrapper: forwards params unchanged to the extension-host implementation
 }