From 106a994a2036307417b5248b5bb3d2438ceccd4b Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 13:40:52 -0400
Subject: [PATCH 01/16] fix: apply media understanding to followup-queued
 messages (#44682)

Voice notes arriving while the agent is mid-turn were queued as
followup messages without audio transcription. The followup runner
called runEmbeddedPiAgent directly, bypassing applyMediaUnderstanding.

This adds a mediaContext field to FollowupRun that snapshots the
original message's media fields. Before the agent run, the followup
runner checks whether media understanding was applied. If not (empty
MediaUnderstanding), it calls applyMediaUnderstanding and rebuilds the
prompt with the transcript, matching the primary path's formatting.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/auto-reply/reply/followup-runner.test.ts | 285 ++++++++++++++++++-
 src/auto-reply/reply/followup-runner.ts      |  65 ++++-
 src/auto-reply/reply/get-reply-run.ts        |  34 ++-
 src/auto-reply/reply/queue.ts                |   5 +-
 src/auto-reply/reply/queue/types.ts          |  37 +++
 5 files changed, 416 insertions(+), 10 deletions(-)
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 8d12e815685..132b1291b39 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -2,13 +2,14 @@ import fs from "node:fs/promises";
 import { tmpdir } from "node:os";
 import path from "node:path";
 import { beforeEach, describe, expect, it, vi } from "vitest";
-import { loadSessionStore, saveSessionStore, type SessionEntry } from "../../config/sessions.js";
+import { loadSessionStore, type SessionEntry, saveSessionStore } from "../../config/sessions.js";
 import type { FollowupRun } from "./queue.js";
 import { createMockFollowupRun, createMockTypingController } from "./test-helpers.js";
 
 const runEmbeddedPiAgentMock = vi.fn();
 const routeReplyMock = vi.fn();
 const isRoutableChannelMock = vi.fn();
+const applyMediaUnderstandingMock = vi.fn();
 
 vi.mock(
   "../../agents/model-fallback.js",
@@ -19,6 +20,10 @@ vi.mock("../../agents/pi-embedded.js", () => ({
   runEmbeddedPiAgent: (params: unknown) => runEmbeddedPiAgentMock(params),
 }));
 
+vi.mock("../../media-understanding/apply.js", () => ({
+  applyMediaUnderstanding: (params: unknown) => applyMediaUnderstandingMock(params),
+}));
+
 vi.mock("./route-reply.js", async (importOriginal) => {
   const actual = await importOriginal<typeof import("./route-reply.js")>();
   return {
@@ -47,13 +52,24 @@ beforeEach(() => {
   isRoutableChannelMock.mockImplementation((ch: string | undefined) =>
     Boolean(ch?.trim() && ROUTABLE_TEST_CHANNELS.has(ch.trim().toLowerCase())),
   );
+  applyMediaUnderstandingMock.mockReset();
+  applyMediaUnderstandingMock.mockResolvedValue({
+    outputs: [],
+    decisions: [],
+    appliedImage: false,
+    appliedAudio: false,
+    appliedVideo: false,
+    appliedFile: false,
+  });
 });
 
 const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun =>
   createMockFollowupRun({ run: { messageProvider } });
 
 function createQueuedRun(
-  overrides: Partial<Omit<FollowupRun, "run">> & { run?: Partial<FollowupRun["run"]> } = {},
+  overrides: Partial<Omit<FollowupRun, "run">> & {
+    run?: Partial<FollowupRun["run"]>;
+  } = {},
 ): FollowupRun {
   return createMockFollowupRun(overrides);
 }
@@ -294,7 +310,12 @@ describe("createFollowupRunner messaging tool dedupe", () => {
       agentResult: {
         ...makeTextReplyDedupeResult(),
         messagingToolSentTargets: [
-          { tool: "telegram", provider: "telegram", to: "268300329", accountId: "work" },
+          {
+            tool: "telegram",
+            provider: "telegram",
+            to: "268300329",
+            accountId: "work",
+          },
         ],
       },
       queued: {
@@ -344,8 +365,13 @@ describe("createFollowupRunner messaging tool dedupe", () => {
       "sessions.json",
     );
     const sessionKey = "main";
-    const sessionEntry: SessionEntry = { sessionId: "session", updatedAt: Date.now() };
-    const sessionStore: Record<string, SessionEntry> = { [sessionKey]: sessionEntry };
+    const sessionEntry: SessionEntry = {
+      sessionId: "session",
+      updatedAt: Date.now(),
+    };
+    const sessionStore: Record<string, SessionEntry> = {
+      [sessionKey]: sessionEntry,
+    };
     await saveSessionStore(storePath, sessionStore);
 
     const { onBlockReply } = await runMessagingCase({
@@ -539,7 +565,254 @@ describe("createFollowupRunner agentDir forwarding", () => {
     });
 
     expect(runEmbeddedPiAgentMock).toHaveBeenCalledTimes(1);
-    const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { agentDir?: string };
+    const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      agentDir?: string;
+    };
     expect(call?.agentDir).toBe(agentDir);
   });
 });
+
+describe("createFollowupRunner media understanding", () => {
+  it("applies audio transcription when mediaContext has untranscribed audio", async () => {
+    const transcriptText = "Hello, this is a voice note.";
+    // The real applyMediaUnderstanding mutates the ctx; the mock must do the same
+    // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line.
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "Got it!" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+        // MediaUnderstanding is empty — transcription not yet applied
+      },
+    });
+    await runner(queued);
+
+    // applyMediaUnderstanding should have been called
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        cfg: queued.run.config,
+        agentDir: queued.run.agentDir,
+      }),
+    );
+
+    // The prompt passed to the agent should include the transcript, not the
+    // raw audio attachment line.
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg");
+
+    expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
+  });
+
+  it("skips media understanding when MediaUnderstanding is already populated", async () => {
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "reply" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: "[Audio]\nTranscript:\nAlready transcribed.\n\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+        // MediaUnderstanding already populated — transcription was applied in primary path
+        MediaUnderstanding: [
+          {
+            kind: "audio.transcription",
+            text: "Already transcribed.",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ],
+      },
+    });
+    await runner(queued);
+
+    // Should NOT re-run media understanding
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+
+    // The original prompt should be passed through unchanged
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Already transcribed.");
+  });
+
+  it("skips media understanding when no mediaContext is present", async () => {
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "reply" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // No mediaContext (plain text message)
+    const queued = createQueuedRun({ prompt: "just text" });
+    await runner(queued);
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+  });
+
+  it("continues with raw prompt when media understanding fails", async () => {
+    applyMediaUnderstandingMock.mockRejectedValueOnce(new Error("transcription service down"));
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "fallback reply" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const originalPrompt = "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text";
+    const queued = createQueuedRun({
+      prompt: originalPrompt,
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+      },
+    });
+    await runner(queued);
+
+    // Should have attempted media understanding
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+
+    // Agent should still run with the original prompt
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toBe(originalPrompt);
+
+    expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" }));
+  });
+
+  it("preserves non-audio media lines when only audio is transcribed", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        // Simulate transcription updating the context
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: "voice transcript",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = "voice transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "voice transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "got both" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt:
+        "[media attached: 2 files]\n[media attached 1/2: /tmp/voice.ogg (audio/ogg)]\n[media attached 2/2: /tmp/photo.jpg (image/jpeg)]\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg", "/tmp/photo.jpg"],
+        MediaTypes: ["audio/ogg", "image/jpeg"],
+      },
+    });
+    await runner(queued);
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // Audio attachment line should be stripped
+    expect(agentCall?.prompt).not.toContain("voice.ogg");
+    // Image attachment line should also be stripped (all media-attached lines are
+    // removed and replaced by the new buildInboundMediaNote output)
+    // The transcript should be present
+    expect(agentCall?.prompt).toContain("voice transcript");
+  });
+});
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 8c7eccb5f02..7a59cea3fbd 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -9,10 +9,13 @@ import type { SessionEntry } from "../../config/sessions.js";
 import type { TypingMode } from "../../config/types.js";
 import { logVerbose } from "../../globals.js";
 import { registerAgentRunContext } from "../../infra/agent-events.js";
+import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
+import { formatMediaUnderstandingBody } from "../../media-understanding/format.js";
 import { defaultRuntime } from "../../runtime.js";
 import { isInternalMessageChannel } from "../../utils/message-channel.js";
 import { stripHeartbeatToken } from "../heartbeat.js";
-import type { OriginatingChannelType } from "../templating.js";
+import { buildInboundMediaNote } from "../media-note.js";
+import type { MsgContext, OriginatingChannelType } from "../templating.js";
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveRunAuthProfile } from "./agent-runner-utils.js";
@@ -154,6 +157,66 @@ export function createFollowupRunner(params: {
       let bootstrapPromptWarningSignaturesSeen = resolveBootstrapWarningSignaturesSeen(
         activeSessionEntry?.systemPromptReport,
       );
+
+      // Apply media understanding for followup-queued messages when it was
+      // not applied (or failed) in the primary path.  This ensures voice
+      // notes that arrived while the agent was mid-turn still get transcribed.
+      if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
+        const hasMedia = Boolean(
+          queued.mediaContext.MediaPath?.trim() ||
+          (Array.isArray(queued.mediaContext.MediaPaths) &&
+            queued.mediaContext.MediaPaths.length > 0),
+        );
+        if (hasMedia) {
+          try {
+            const mediaCtx = { ...queued.mediaContext } as MsgContext;
+            const muResult = await applyMediaUnderstanding({
+              ctx: mediaCtx,
+              cfg: queued.run.config,
+              agentDir: queued.run.agentDir,
+              activeModel: {
+                provider: queued.run.provider,
+                model: queued.run.model,
+              },
+            });
+            if (muResult.outputs.length > 0) {
+              // Rebuild the prompt with media understanding results baked in,
+              // matching the primary path's formatting.
+              const newMediaNote = buildInboundMediaNote(mediaCtx);
+              const transcriptBody = formatMediaUnderstandingBody({
+                body: undefined,
+                outputs: muResult.outputs,
+              });
+
+              // Strip existing [media attached ...] lines from the prompt so
+              // they can be replaced by the updated media note (which excludes
+              // successfully-understood attachments like transcribed audio).
+              const stripped = queued.prompt
+                .replace(/\[media attached: \d+ files\]\n?/g, "")
+                .replace(/\[media attached[^\]]*\]\n?/g, "");
+
+              const parts: string[] = [];
+              if (newMediaNote) {
+                parts.push(newMediaNote);
+              }
+              if (transcriptBody) {
+                parts.push(transcriptBody);
+              }
+              parts.push(stripped.trim());
+              queued.prompt = parts.filter(Boolean).join("\n\n");
+
+              logVerbose(
+                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`,
+              );
+            }
+          } catch (err) {
+            logVerbose(
+              `followup: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`,
+            );
+          }
+        }
+      }
+
       try {
         const fallbackResult = await runWithModelFallback({
           cfg: queued.run.config,
diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts
index 760c42aed1a..4cf76e6ed31 100644
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -384,7 +384,7 @@ export async function runPreparedReply(
   const mediaReplyHint = mediaNote
     ? "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body."
     : undefined;
-  let prefixedCommandBody = mediaNote
+  const prefixedCommandBody = mediaNote
     ? [mediaNote, mediaReplyHint, prefixedBody ?? ""].filter(Boolean).join("\n").trim()
     : prefixedBody;
   if (!resolvedThinkLevel) {
@@ -469,11 +469,43 @@ export async function runPreparedReply(
     isNewSession,
   });
   const authProfileIdSource = sessionEntry?.authProfileOverrideSource;
+  // Snapshot media-related context for deferred media understanding in the
+  // followup runner.  When MediaUnderstanding is already populated the runner
+  // knows transcription already succeeded and skips re-application.
+  const hasMediaAttachments = Boolean(
+    ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0),
+  );
+  const mediaContext = hasMediaAttachments
+    ? {
+        Body: ctx.Body,
+        CommandBody: ctx.CommandBody,
+        RawBody: ctx.RawBody,
+        MediaPath: ctx.MediaPath,
+        MediaUrl: ctx.MediaUrl,
+        MediaType: ctx.MediaType,
+        MediaDir: ctx.MediaDir,
+        MediaPaths: ctx.MediaPaths ? [...ctx.MediaPaths] : undefined,
+        MediaUrls: ctx.MediaUrls ? [...ctx.MediaUrls] : undefined,
+        MediaTypes: ctx.MediaTypes ? [...ctx.MediaTypes] : undefined,
+        MediaRemoteHost: ctx.MediaRemoteHost,
+        Transcript: ctx.Transcript,
+        MediaUnderstanding: ctx.MediaUnderstanding ? [...ctx.MediaUnderstanding] : undefined,
+        MediaUnderstandingDecisions: ctx.MediaUnderstandingDecisions
+          ? [...ctx.MediaUnderstandingDecisions]
+          : undefined,
+        OriginatingChannel: ctx.OriginatingChannel,
+        OriginatingTo: ctx.OriginatingTo,
+        AccountId: ctx.AccountId,
+        MessageThreadId: ctx.MessageThreadId,
+      }
+    : undefined;
+
   const followupRun = {
     prompt: queuedBody,
     messageId: sessionCtx.MessageSidFull ?? sessionCtx.MessageSid,
     summaryLine: baseBodyTrimmedRaw,
     enqueuedAt: Date.now(),
+    mediaContext,
     // Originating channel for reply routing.
     originatingChannel: ctx.OriginatingChannel,
     originatingTo: ctx.OriginatingTo,
diff --git a/src/auto-reply/reply/queue.ts b/src/auto-reply/reply/queue.ts
index b097b6c5193..5c3a56f64af 100644
--- a/src/auto-reply/reply/queue.ts
+++ b/src/auto-reply/reply/queue.ts
@@ -1,6 +1,6 @@
-export { extractQueueDirective } from "./queue/directive.js";
-export { clearSessionQueues } from "./queue/cleanup.js";
 export type { ClearSessionQueueResult } from "./queue/cleanup.js";
+export { clearSessionQueues } from "./queue/cleanup.js";
+export { extractQueueDirective } from "./queue/directive.js";
 export { scheduleFollowupDrain } from "./queue/drain.js";
 export {
   enqueueFollowupRun,
@@ -10,6 +10,7 @@ export {
 export { resolveQueueSettings } from "./queue/settings.js";
 export { clearFollowupQueue } from "./queue/state.js";
 export type {
+  FollowupMediaContext,
   FollowupRun,
   QueueDedupeMode,
   QueueDropPolicy,
diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts
index 507f77d499d..637ceb0a149 100644
--- a/src/auto-reply/reply/queue/types.ts
+++ b/src/auto-reply/reply/queue/types.ts
@@ -2,6 +2,10 @@ import type { ExecToolDefaults } from "../../../agents/bash-tools.js";
 import type { SkillSnapshot } from "../../../agents/skills.js";
 import type { OpenClawConfig } from "../../../config/config.js";
 import type { SessionEntry } from "../../../config/sessions.js";
+import type {
+  MediaUnderstandingDecision,
+  MediaUnderstandingOutput,
+} from "../../../media-understanding/types.js";
 import type { InputProvenance } from "../../../sessions/input-provenance.js";
 import type { OriginatingChannelType } from "../../templating.js";
 import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "../directives.js";
@@ -19,12 +23,45 @@ export type QueueSettings = {
 
 export type QueueDedupeMode = "message-id" | "prompt" | "none";
 
+/**
+ * Snapshot of media-related context fields carried on a FollowupRun so that
+ * the followup runner can apply media understanding (e.g. voice-note
+ * transcription) when it was not applied — or failed — in the primary path.
+ */
+export type FollowupMediaContext = {
+  Body?: string;
+  CommandBody?: string;
+  RawBody?: string;
+  MediaPath?: string;
+  MediaUrl?: string;
+  MediaType?: string;
+  MediaDir?: string;
+  MediaPaths?: string[];
+  MediaUrls?: string[];
+  MediaTypes?: string[];
+  MediaRemoteHost?: string;
+  Transcript?: string;
+  MediaUnderstanding?: MediaUnderstandingOutput[];
+  MediaUnderstandingDecisions?: MediaUnderstandingDecision[];
+  OriginatingChannel?: OriginatingChannelType;
+  OriginatingTo?: string;
+  AccountId?: string;
+  MessageThreadId?: string | number;
+};
+
 export type FollowupRun = {
   prompt: string;
   /** Provider message ID, when available (for deduplication). */
   messageId?: string;
   summaryLine?: string;
   enqueuedAt: number;
+  /**
+   * Media context snapshot from the original inbound message.
+   * When present and MediaUnderstanding is empty, the followup runner will
+   * attempt to apply media understanding (audio transcription, etc.) before
+   * passing the prompt to the agent.
+   */
+  mediaContext?: FollowupMediaContext;
   /**
    * Originating channel for reply routing.
    * When set, replies should be routed back to this provider

From b6112bbfdf72c519bb05f25044c17ade2a0a1f16 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 15:29:55 -0400
Subject: [PATCH 02/16] fix queued media-understanding prompt rebuild

---
 src/auto-reply/reply/followup-runner.test.ts | 170 ++++++++++++++++++-
 src/auto-reply/reply/followup-runner.ts      | 116 ++++++++++---
 2 files changed, 260 insertions(+), 26 deletions(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 132b1291b39..1faac44a8a3 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -66,6 +66,9 @@ beforeEach(() => {
 const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun =>
   createMockFollowupRun({ run: { messageProvider } });
 
+const MEDIA_REPLY_HINT =
+  "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body.";
+
 function createQueuedRun(
   overrides: Partial<Omit<FollowupRun, "run">> & {
     run?: Partial<FollowupRun["run"]>;
@@ -576,7 +579,7 @@ describe("createFollowupRunner media understanding", () => {
   it("applies audio transcription when mediaContext has untranscribed audio", async () => {
     const transcriptText = "Hello, this is a voice note.";
     // The real applyMediaUnderstanding mutates the ctx; the mock must do the same
-    // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line.
+    // so buildInboundMediaNote and queued prompt rebuilding see the transcribed body.
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
         params.ctx.MediaUnderstanding = [
@@ -588,6 +591,7 @@ describe("createFollowupRunner media understanding", () => {
           },
         ];
         params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
         return {
           outputs: [
             {
@@ -764,6 +768,7 @@ describe("createFollowupRunner media understanding", () => {
           },
         ];
         params.ctx.Transcript = "voice transcript";
+        params.ctx.Body = "[Audio]\nUser text:\nsome text\nTranscript:\nvoice transcript";
         return {
           outputs: [
             {
@@ -815,4 +820,167 @@ describe("createFollowupRunner media understanding", () => {
     // The transcript should be present
     expect(agentCall?.prompt).toContain("voice transcript");
   });
+
+  it("strips queued media lines when attachment paths or URLs contain a literal closing bracket", async () => {
+    const transcriptText = "Bracket-safe transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "ok" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg?sig=abc]123]\n" +
+          MEDIA_REPLY_HINT +
+          "\n" +
+          "some text",
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/voice[0].ogg"],
+          MediaUrls: ["https://cdn.example.com/files[0].ogg?sig=abc]123"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("/tmp/voice[0].ogg");
+    expect(agentCall?.prompt).not.toContain("https://cdn.example.com/files[0].ogg?sig=abc]123");
+    expect(agentCall?.prompt).not.toContain(MEDIA_REPLY_HINT);
+  });
+
+  it("preserves file-only media understanding when outputs are empty", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body =
+          '<file name="report.pdf" mime="application/pdf">\nQuarterly report body\n</file>';
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\n[User sent media without caption]`,
+        mediaContext: {
+          Body: "",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[media attached: /tmp/report.pdf (application/pdf)]");
+    expect(agentCall?.prompt).toContain(MEDIA_REPLY_HINT);
+    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
+    expect(agentCall?.prompt).toContain("Quarterly report body");
+    expect(agentCall?.prompt).not.toContain("[User sent media without caption]");
+  });
+
+  it("replaces the queued body when inline directives were already stripped from the prompt", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body =
+          '/think high summarize this\n\n<file name="report.pdf" mime="application/pdf">\nreport\n</file>';
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
+    expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this");
+    expect(agentCall?.prompt).not.toContain("/think high summarize this");
+  });
 });
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 7a59cea3fbd..1688ffe9ccc 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -10,7 +10,6 @@ import type { TypingMode } from "../../config/types.js";
 import { logVerbose } from "../../globals.js";
 import { registerAgentRunContext } from "../../infra/agent-events.js";
 import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
-import { formatMediaUnderstandingBody } from "../../media-understanding/format.js";
 import { defaultRuntime } from "../../runtime.js";
 import { isInternalMessageChannel } from "../../utils/message-channel.js";
 import { stripHeartbeatToken } from "../heartbeat.js";
@@ -19,6 +18,7 @@ import type { MsgContext, OriginatingChannelType } from "../templating.js";
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveRunAuthProfile } from "./agent-runner-utils.js";
+import { parseInlineDirectives } from "./directive-handling.js";
 import {
   resolveOriginAccountId,
   resolveOriginMessageProvider,
@@ -37,6 +37,86 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r
 import { createTypingSignaler } from "./typing-mode.js";
 import type { TypingController } from "./typing.js";
 
+const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
+const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
+
+function stripLeadingMediaAttachedLines(prompt: string): string {
+  const lines = prompt.split("\n");
+  let index = 0;
+  while (index < lines.length) {
+    const trimmed = lines[index]?.trim() ?? "";
+    if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) {
+      break;
+    }
+    index += 1;
+  }
+  return lines.slice(index).join("\n").trim();
+}
+
+function stripLeadingMediaReplyHint(prompt: string): string {
+  const lines = prompt.split("\n");
+  if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) {
+    return lines.slice(1).join("\n").trim();
+  }
+  return prompt.trim();
+}
+
+function replaceLastOccurrence(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = value.lastIndexOf(search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
+function stripInlineDirectives(text: string | undefined): string {
+  return parseInlineDirectives(text ?? "").cleaned.trim();
+}
+
+function rebuildQueuedPromptWithMediaUnderstanding(params: {
+  prompt: string;
+  originalBody?: string;
+  updatedBody?: string;
+  mediaNote?: string;
+}): string {
+  let stripped = stripLeadingMediaAttachedLines(params.prompt);
+  if (!params.mediaNote) {
+    stripped = stripLeadingMediaReplyHint(stripped);
+  }
+
+  const updatedBody = stripInlineDirectives(params.updatedBody);
+  if (!updatedBody) {
+    return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
+  }
+
+  const replacementTargets = [
+    params.originalBody?.trim(),
+    stripInlineDirectives(params.originalBody),
+    MEDIA_ONLY_PLACEHOLDER,
+  ].filter(
+    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
+  );
+
+  let rebuilt = stripped;
+  for (const target of replacementTargets) {
+    const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
+    if (replaced !== undefined) {
+      rebuilt = replaced;
+      return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+    }
+  }
+
+  rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
+  return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+}
+
 export function createFollowupRunner(params: {
   opts?: GetReplyOptions;
   typing: TypingController;
@@ -170,6 +250,7 @@ export function createFollowupRunner(params: {
         if (hasMedia) {
           try {
             const mediaCtx = { ...queued.mediaContext } as MsgContext;
+            const originalBody = mediaCtx.Body;
             const muResult = await applyMediaUnderstanding({
               ctx: mediaCtx,
               cfg: queued.run.config,
@@ -179,34 +260,19 @@ export function createFollowupRunner(params: {
                 model: queued.run.model,
               },
             });
-            if (muResult.outputs.length > 0) {
-              // Rebuild the prompt with media understanding results baked in,
-              // matching the primary path's formatting.
+            if (muResult.outputs.length > 0 || muResult.appliedFile) {
+              // Rebuild the queued prompt from the mutated media context so the
+              // deferred path matches the primary path's prompt shape.
               const newMediaNote = buildInboundMediaNote(mediaCtx);
-              const transcriptBody = formatMediaUnderstandingBody({
-                body: undefined,
-                outputs: muResult.outputs,
+              queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
+                prompt: queued.prompt,
+                originalBody,
+                updatedBody: mediaCtx.Body,
+                mediaNote: newMediaNote,
               });
 
-              // Strip existing [media attached ...] lines from the prompt so
-              // they can be replaced by the updated media note (which excludes
-              // successfully-understood attachments like transcribed audio).
-              const stripped = queued.prompt
-                .replace(/\[media attached: \d+ files\]\n?/g, "")
-                .replace(/\[media attached[^\]]*\]\n?/g, "");
-
-              const parts: string[] = [];
-              if (newMediaNote) {
-                parts.push(newMediaNote);
-              }
-              if (transcriptBody) {
-                parts.push(transcriptBody);
-              }
-              parts.push(stripped.trim());
-              queued.prompt = parts.filter(Boolean).join("\n\n");
-
               logVerbose(
-                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`,
+                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
               );
             }
           } catch (err) {

From dd19a53a8ed6a3156cc49c06c1ebd3d876690d44 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 15:45:17 -0400
Subject: [PATCH 03/16] fix: rebuild queued followup media prompts

---
 src/auto-reply/reply/followup-runner.test.ts | 180 +++++++++++++++++++
 src/auto-reply/reply/followup-runner.ts      |  21 ++-
 2 files changed, 196 insertions(+), 5 deletions(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 1faac44a8a3..f866c664763 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -653,6 +653,138 @@ describe("createFollowupRunner media understanding", () => {
     expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
   });
 
+  it("strips the full media line when attachment paths or URLs contain brackets", async () => {
+    const transcriptText = "Bracket-safe transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "done" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/voice[0].ogg"],
+          MediaUrls: ["https://cdn.example.com/files[0].ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("[media attached:");
+    expect(agentCall?.prompt).not.toContain("files[0].ogg]");
+  });
+
+  it("only strips leading synthetic media lines and preserves literal user text later in the prompt", async () => {
+    const transcriptText = "Transcript with literal token";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = "I literally typed [media attached: keep me] in this message.";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "done" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice.ogg (audio/ogg)]\nI literally typed [media attached: keep me] in this message.",
+        mediaContext: {
+          Body: "I literally typed [media attached: keep me] in this message.",
+          CommandBody: "I literally typed [media attached: keep me] in this message.",
+          RawBody: "I literally typed [media attached: keep me] in this message.",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(
+      "I literally typed [media attached: keep me] in this message.",
+    );
+    expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg (audio/ogg)]");
+  });
+
   it("skips media understanding when MediaUnderstanding is already populated", async () => {
     runEmbeddedPiAgentMock.mockResolvedValueOnce({
       payloads: [{ text: "reply" }],
@@ -755,6 +887,54 @@ describe("createFollowupRunner media understanding", () => {
     expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" }));
   });
 
+  it("rebuilds the prompt when file extraction succeeds without media outputs", async () => {
+    const fileBlock = '<file name="notes.txt" mime="text/plain">\nline one\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `some text\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "file processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/notes.txt (text/plain)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[media attached: /tmp/notes.txt (text/plain)]");
+    expect(agentCall?.prompt).toContain(fileBlock);
+    expect(agentCall?.prompt?.match(/<file\b/g)).toHaveLength(1);
+  });
+
   it("preserves non-audio media lines when only audio is transcribed", async () => {
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 1688ffe9ccc..45534879e78 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -39,13 +39,16 @@ import type { TypingController } from "./typing.js";
 
 const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
+const LEADING_MEDIA_ATTACHED_LINE_RE =
+  /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
+const FILE_BLOCK_RE = /<file\b/i;
 
 function stripLeadingMediaAttachedLines(prompt: string): string {
   const lines = prompt.split("\n");
   let index = 0;
   while (index < lines.length) {
     const trimmed = lines[index]?.trim() ?? "";
-    if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) {
+    if (!LEADING_MEDIA_ATTACHED_LINE_RE.test(trimmed)) {
       break;
     }
     index += 1;
@@ -249,8 +252,14 @@ export function createFollowupRunner(params: {
         );
         if (hasMedia) {
           try {
-            const mediaCtx = { ...queued.mediaContext } as MsgContext;
-            const originalBody = mediaCtx.Body;
+            const mediaCtx = {
+              ...queued.mediaContext,
+              Body:
+                queued.mediaContext.CommandBody ??
+                queued.mediaContext.RawBody ??
+                queued.mediaContext.Body,
+            } as MsgContext;
+            const originalBody = queued.mediaContext.Body;
             const muResult = await applyMediaUnderstanding({
               ctx: mediaCtx,
               cfg: queued.run.config,
@@ -260,7 +269,10 @@ export function createFollowupRunner(params: {
                 model: queued.run.model,
               },
             });
-            if (muResult.outputs.length > 0 || muResult.appliedFile) {
+            const shouldRebuildPrompt =
+              muResult.outputs.length > 0 ||
+              (muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt));
+            if (shouldRebuildPrompt) {
               // Rebuild the queued prompt from the mutated media context so the
               // deferred path matches the primary path's prompt shape.
               const newMediaNote = buildInboundMediaNote(mediaCtx);
@@ -270,7 +282,6 @@ export function createFollowupRunner(params: {
                 updatedBody: mediaCtx.Body,
                 mediaNote: newMediaNote,
               });
-
               logVerbose(
                 `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
               );

From 9dd7e2c791f0276803df166d18474f915a34c29a Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 16:56:36 -0400
Subject: [PATCH 04/16] fix: narrow FILE_BLOCK_RE, align originalBody, check
 body not prompt

---
 src/auto-reply/reply/followup-runner.test.ts | 109 +++++++++++++++++++
 src/auto-reply/reply/followup-runner.ts      |  21 ++--
 2 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index f866c664763..1956b450fa8 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1163,4 +1163,113 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this");
     expect(agentCall?.prompt).not.toContain("/think high summarize this");
   });
+
+  it("does not false-positive on user text containing literal '<file' when extracting files", async () => {
+    const fileBlock = '<file name="data.csv" mime="text/csv">\ncol1,col2\n1,2\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `check my <file upload please\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "got it" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // User message contains literal "<file" text but that should NOT prevent
+    // file extraction results from being embedded in the prompt.
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
+        mediaContext: {
+          Body: "check my <file upload please",
+          CommandBody: "check my <file upload please",
+          RawBody: "check my <file upload please",
+          MediaPaths: ["/tmp/data.csv"],
+          MediaTypes: ["text/csv"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // The file extraction result should be present in the prompt
+    expect(agentCall?.prompt).toContain(fileBlock);
+    expect(agentCall?.prompt).toContain("check my <file upload please");
+  });
+
+  it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        // applyMediaUnderstanding mutates the resolved body (which is CommandBody)
+        params.ctx.Body = `summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // Body has directive prefix; CommandBody has the cleaned version.
+    // The prompt was built from CommandBody, so originalBody should match CommandBody
+    // for accurate replacement.
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          CommandBody: "summarize this",
+          RawBody: "/think high summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // File block should be present (extraction succeeded)
+    expect(agentCall?.prompt).toContain(fileBlock);
+    // The body text should appear once, not duplicated
+    expect(agentCall?.prompt).toContain("summarize this");
+    // Should NOT contain the directive prefix
+    expect(agentCall?.prompt).not.toContain("/think high");
+    // The body should not be duplicated (would happen if originalBody didn't match)
+    const matches = agentCall?.prompt?.match(/summarize this/g);
+    expect(matches?.length).toBe(1);
+  });
 });
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index 45534879e78..d2a2d4efd44 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -41,7 +41,7 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
 const LEADING_MEDIA_ATTACHED_LINE_RE =
   /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
-const FILE_BLOCK_RE = /<file\b/i;
+const FILE_BLOCK_RE = /<file\s+name="/i;
 
 function stripLeadingMediaAttachedLines(prompt: string): string {
   const lines = prompt.split("\n");
@@ -252,14 +252,21 @@ export function createFollowupRunner(params: {
         );
         if (hasMedia) {
           try {
+            const resolvedOriginalBody =
+              queued.mediaContext.CommandBody ??
+              queued.mediaContext.RawBody ??
+              queued.mediaContext.Body;
             const mediaCtx = {
               ...queued.mediaContext,
-              Body:
-                queued.mediaContext.CommandBody ??
-                queued.mediaContext.RawBody ??
-                queued.mediaContext.Body,
+              Body: resolvedOriginalBody,
             } as MsgContext;
-            const originalBody = queued.mediaContext.Body;
+            const originalBody = resolvedOriginalBody;
+            // Capture whether the resolved body already contains a file block
+            // BEFORE applyMediaUnderstanding mutates it — this detects prior
+            // extraction so we avoid double-inserting.  Checking the body
+            // (not the full queued.prompt) avoids false positives from user
+            // messages that happen to contain literal "<file path=" text.
+            const bodyAlreadyHasFileBlock = FILE_BLOCK_RE.test(resolvedOriginalBody ?? "");
             const muResult = await applyMediaUnderstanding({
               ctx: mediaCtx,
               cfg: queued.run.config,
@@ -271,7 +278,7 @@ export function createFollowupRunner(params: {
             });
             const shouldRebuildPrompt =
               muResult.outputs.length > 0 ||
-              (muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt));
+              (muResult.appliedFile && !bodyAlreadyHasFileBlock);
             if (shouldRebuildPrompt) {
               // Rebuild the queued prompt from the mutated media context so the
               // deferred path matches the primary path's prompt shape.

From a472dce14735aeca65d69f2928950c6a9a574859 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 17:53:40 -0400
Subject: [PATCH 05/16] Auto-reply: preserve deferred media understanding
 output

---
 src/auto-reply/reply/followup-runner.test.ts  | 306 +++++++++++++++++-
 src/auto-reply/reply/followup-runner.ts       |  38 ++-
 .../reply/get-reply-run.media-only.test.ts    |  74 +++++
 src/auto-reply/reply/get-reply-run.ts         |  14 +-
 4 files changed, 424 insertions(+), 8 deletions(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 1956b450fa8..7d06c882c2c 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -653,6 +653,69 @@ describe("createFollowupRunner media understanding", () => {
     expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
   });
 
+  it("applies media understanding for URL-only attachments", async () => {
+    const transcriptText = "URL-only transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "Got it!" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          MediaUrl: "https://cdn.example.com/voice.ogg",
+          MediaUrls: ["https://cdn.example.com/voice.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+  });
+
   it("strips the full media line when attachment paths or URLs contain brackets", async () => {
     const transcriptText = "Bracket-safe transcript";
     applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1164,6 +1227,98 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).not.toContain("/think high summarize this");
   });
 
+  it("preserves directive-like tokens inside extracted media content", async () => {
+    const fileBlock =
+      '<file name="notes.txt" mime="text/plain">\n/model claude-opus should stay\n/queue followup should stay\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `/think high summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).not.toContain("/think high summarize this");
+    expect(agentCall?.prompt).toContain("/model claude-opus should stay");
+    expect(agentCall?.prompt).toContain("/queue followup should stay");
+  });
+
+  it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
+    const description = "[Image]\nDescription:\na mountain at sunset";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = description;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: true,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/photo.jpg"],
+          MediaTypes: ["image/jpeg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("a mountain at sunset");
+  });
+
   it("does not false-positive on user text containing literal '<file' when extracting files", async () => {
     const fileBlock = '<file name="data.csv" mime="text/csv">\ncol1,col2\n1,2\n</file>';
     applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1195,8 +1350,7 @@ describe("createFollowupRunner media understanding", () => {
     // file extraction results from being embedded in the prompt.
     await runner(
       createQueuedRun({
-        prompt:
-          "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
+        prompt: "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
         mediaContext: {
           Body: "check my <file upload please",
           CommandBody: "check my <file upload please",
@@ -1215,6 +1369,154 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).toContain("check my <file upload please");
   });
 
+  it("preserves directive-like text that appears inside extracted file content", async () => {
+    const fileBlock =
+      '<file name="notes.txt" mime="text/plain">\nRun `/think high` literally in the shell example.\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          CommandBody: "summarize this",
+          RawBody: "/think high summarize this",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example.");
+  });
+
+  it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items.";
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: true,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/board.jpg"],
+          MediaTypes: ["image/jpeg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[Image summary]");
+    expect(agentCall?.prompt).toContain("A whiteboard with action items.");
+  });
+
+  it("applies media understanding for URL-only deferred attachments", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = "[Audio]\nTranscript:\nremote transcript";
+        params.ctx.Transcript = "remote transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "remote transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[User sent media without caption]",
+        mediaContext: {
+          Body: "",
+          MediaUrl: "https://cdn.example.com/audio.ogg",
+          MediaUrls: ["https://cdn.example.com/audio.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("remote transcript");
+  });
+
   it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => {
     const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
     applyMediaUnderstandingMock.mockImplementationOnce(
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index d2a2d4efd44..87e2f975ce0 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -39,8 +39,7 @@ import type { TypingController } from "./typing.js";
 
 const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
-const LEADING_MEDIA_ATTACHED_LINE_RE =
-  /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/;
+const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
 const FILE_BLOCK_RE = /<file\s+name="/i;
 
 function stripLeadingMediaAttachedLines(prompt: string): string {
@@ -83,6 +82,28 @@ function stripInlineDirectives(text: string | undefined): string {
   return parseInlineDirectives(text ?? "").cleaned.trim();
 }
 
+function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
+  const updatedBody = params.updatedBody?.trim();
+  if (!updatedBody) {
+    return "";
+  }
+  const originalBody = params.originalBody?.trim();
+  if (!originalBody) {
+    return updatedBody;
+  }
+
+  const cleanedOriginalBody = stripInlineDirectives(originalBody);
+  if (!cleanedOriginalBody) {
+    return updatedBody;
+  }
+  if (updatedBody === originalBody) {
+    return cleanedOriginalBody;
+  }
+  return (
+    replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
+  ).trim();
+}
+
 function rebuildQueuedPromptWithMediaUnderstanding(params: {
   prompt: string;
   originalBody?: string;
@@ -94,7 +115,10 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     stripped = stripLeadingMediaReplyHint(stripped);
   }
 
-  const updatedBody = stripInlineDirectives(params.updatedBody);
+  const updatedBody = normalizeUpdatedBody({
+    originalBody: params.originalBody,
+    updatedBody: params.updatedBody,
+  });
   if (!updatedBody) {
     return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
   }
@@ -247,8 +271,11 @@ export function createFollowupRunner(params: {
       if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
         const hasMedia = Boolean(
           queued.mediaContext.MediaPath?.trim() ||
+          queued.mediaContext.MediaUrl?.trim() ||
           (Array.isArray(queued.mediaContext.MediaPaths) &&
-            queued.mediaContext.MediaPaths.length > 0),
+            queued.mediaContext.MediaPaths.length > 0) ||
+          (Array.isArray(queued.mediaContext.MediaUrls) &&
+            queued.mediaContext.MediaUrls.length > 0),
         );
         if (hasMedia) {
           try {
@@ -278,6 +305,9 @@ export function createFollowupRunner(params: {
             });
             const shouldRebuildPrompt =
               muResult.outputs.length > 0 ||
+              muResult.appliedAudio ||
+              muResult.appliedImage ||
+              muResult.appliedVideo ||
               (muResult.appliedFile && !bodyAlreadyHasFileBlock);
             if (shouldRebuildPrompt) {
               // Rebuild the queued prompt from the mutated media context so the
diff --git a/src/auto-reply/reply/get-reply-run.media-only.test.ts b/src/auto-reply/reply/get-reply-run.media-only.test.ts
index 829b3937009..f519da10082 100644
--- a/src/auto-reply/reply/get-reply-run.media-only.test.ts
+++ b/src/auto-reply/reply/get-reply-run.media-only.test.ts
@@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => {
     expect(call?.followupRun.prompt).toContain("[User sent media without caption]");
   });
 
+  it("snapshots URL-only attachments into followup mediaContext", async () => {
+    await runPreparedReply(
+      baseParams({
+        ctx: {
+          Body: "check this attachment",
+          RawBody: "check this attachment",
+          CommandBody: "check this attachment",
+          ThreadHistoryBody: "Earlier message in this thread",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+          ChatType: "group",
+          MediaUrl: "https://cdn.example.com/input.png",
+          MediaUrls: ["https://cdn.example.com/input.png"],
+          MediaType: "image/png",
+          MediaTypes: ["image/png"],
+        },
+        sessionCtx: {
+          Body: "check this attachment",
+          BodyStripped: "check this attachment",
+          ThreadHistoryBody: "Earlier message in this thread",
+          Provider: "slack",
+          ChatType: "group",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+        },
+      }),
+    );
+
+    const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
+    expect(call?.followupRun.mediaContext).toEqual(
+      expect.objectContaining({
+        MediaUrl: "https://cdn.example.com/input.png",
+        MediaUrls: ["https://cdn.example.com/input.png"],
+        MediaType: "image/png",
+        MediaTypes: ["image/png"],
+      }),
+    );
+  });
+
   it("keeps thread history context on follow-up turns", async () => {
     const result = await runPreparedReply(
       baseParams({
@@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => {
     expect(call?.followupRun.prompt).toContain("Earlier message in this thread");
   });
 
+  it("snapshots mediaContext for URL-only deferred attachments", async () => {
+    await runPreparedReply(
+      baseParams({
+        ctx: {
+          Body: "",
+          RawBody: "",
+          CommandBody: "",
+          MediaUrl: "https://cdn.example.com/audio.ogg",
+          MediaUrls: ["https://cdn.example.com/audio.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+          ThreadHistoryBody: "Earlier message in this thread",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+          ChatType: "group",
+        },
+        sessionCtx: {
+          Body: "",
+          BodyStripped: "",
+          ThreadHistoryBody: "Earlier message in this thread",
+          Provider: "slack",
+          ChatType: "group",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+        },
+      }),
+    );
+
+    const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
+    expect(call?.followupRun.mediaContext?.MediaUrl).toBe("https://cdn.example.com/audio.ogg");
+    expect(call?.followupRun.mediaContext?.MediaUrls).toEqual([
+      "https://cdn.example.com/audio.ogg",
+    ]);
+  });
+
   it("returns the empty-body reply when there is no text and no media", async () => {
     const result = await runPreparedReply(
       baseParams({
diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts
index 4cf76e6ed31..32999a66238 100644
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -307,7 +307,14 @@ export async function runPreparedReply(
     : [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n");
   const baseBodyTrimmed = baseBodyForPrompt.trim();
   const hasMediaAttachment = Boolean(
-    sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0),
+    sessionCtx.MediaPath ||
+    sessionCtx.MediaUrl ||
+    (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) ||
+    (sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) ||
+    ctx.MediaPath?.trim() ||
+    ctx.MediaUrl?.trim() ||
+    (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
+    (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
   );
   if (!baseBodyTrimmed && !hasMediaAttachment) {
     await typing.onReplyStart();
@@ -473,7 +480,10 @@ export async function runPreparedReply(
   // followup runner.  When MediaUnderstanding is already populated the runner
   // knows transcription already succeeded and skips re-application.
   const hasMediaAttachments = Boolean(
-    ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0),
+    ctx.MediaPath?.trim() ||
+    ctx.MediaUrl?.trim() ||
+    (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
+    (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
   );
   const mediaContext = hasMediaAttachments
     ? {

From 8a5ad5e320beb0b5d50fb8ab8a07ee246a754b01 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 21:54:35 -0400
Subject: [PATCH 06/16] Reply: preserve deferred queued media context

---
 src/auto-reply/reply/followup-media.ts       | 241 +++++++++++++++++++
 src/auto-reply/reply/followup-runner.test.ts | 225 ++++++++++++++++-
 src/auto-reply/reply/followup-runner.ts      | 180 +-------------
 src/auto-reply/reply/get-reply-run.ts        |   2 +
 src/auto-reply/reply/queue/drain.ts          |  71 +++++-
 src/auto-reply/reply/queue/enqueue.ts        |  37 ++-
 src/auto-reply/reply/queue/state.ts          |   3 +
 src/auto-reply/reply/queue/types.ts          |   3 +
 8 files changed, 572 insertions(+), 190 deletions(-)
 create mode 100644 src/auto-reply/reply/followup-media.ts

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
new file mode 100644
index 00000000000..5a014e63f9b
--- /dev/null
+++ b/src/auto-reply/reply/followup-media.ts
@@ -0,0 +1,241 @@
+import { logVerbose } from "../../globals.js";
+import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
+import {
+  normalizeAttachments,
+  resolveAttachmentKind,
+} from "../../media-understanding/attachments.js";
+import { buildInboundMediaNote } from "../media-note.js";
+import type { MsgContext } from "../templating.js";
+import { parseInlineDirectives } from "./directive-handling.js";
+import type { FollowupMediaContext, FollowupRun } from "./queue/types.js";
+
+const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
+const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
+const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
+const FILE_BLOCK_RE = /<file\s+name="/i;
+
+function stripLeadingMediaAttachedLines(prompt: string): string {
+  const lines = prompt.split("\n");
+  let index = 0;
+  while (index < lines.length) {
+    const trimmed = lines[index]?.trim() ?? "";
+    if (!LEADING_MEDIA_ATTACHED_LINE_RE.test(trimmed)) {
+      break;
+    }
+    index += 1;
+  }
+  return lines.slice(index).join("\n").trim();
+}
+
+function stripLeadingMediaReplyHint(prompt: string): string {
+  const lines = prompt.split("\n");
+  if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) {
+    return lines.slice(1).join("\n").trim();
+  }
+  return prompt.trim();
+}
+
+function replaceLastOccurrence(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = value.lastIndexOf(search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
+function stripInlineDirectives(text: string | undefined): string {
+  return parseInlineDirectives(text ?? "").cleaned.trim();
+}
+
+function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
+  const updatedBody = params.updatedBody?.trim();
+  if (!updatedBody) {
+    return "";
+  }
+  const originalBody = params.originalBody?.trim();
+  if (!originalBody) {
+    return updatedBody;
+  }
+
+  const cleanedOriginalBody = stripInlineDirectives(originalBody);
+  if (!cleanedOriginalBody) {
+    return updatedBody;
+  }
+  if (updatedBody === originalBody) {
+    return cleanedOriginalBody;
+  }
+  return (
+    replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
+  ).trim();
+}
+
+function rebuildQueuedPromptWithMediaUnderstanding(params: {
+  prompt: string;
+  originalBody?: string;
+  updatedBody?: string;
+  mediaNote?: string;
+}): string {
+  let stripped = stripLeadingMediaAttachedLines(params.prompt);
+  if (!params.mediaNote) {
+    stripped = stripLeadingMediaReplyHint(stripped);
+  }
+
+  const updatedBody = normalizeUpdatedBody({
+    originalBody: params.originalBody,
+    updatedBody: params.updatedBody,
+  });
+  if (!updatedBody) {
+    return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
+  }
+
+  const replacementTargets = [
+    params.originalBody?.trim(),
+    stripInlineDirectives(params.originalBody),
+    MEDIA_ONLY_PLACEHOLDER,
+  ].filter(
+    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
+  );
+
+  let rebuilt = stripped;
+  for (const target of replacementTargets) {
+    const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
+    if (replaced !== undefined) {
+      rebuilt = replaced;
+      return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+    }
+  }
+
+  rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
+  return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+}
+
+function hasMediaAttachments(mediaContext: FollowupMediaContext): boolean {
+  return Boolean(
+    mediaContext.MediaPath?.trim() ||
+    mediaContext.MediaUrl?.trim() ||
+    (Array.isArray(mediaContext.MediaPaths) && mediaContext.MediaPaths.length > 0) ||
+    (Array.isArray(mediaContext.MediaUrls) && mediaContext.MediaUrls.length > 0),
+  );
+}
+
+function hasOnlyFileLikeAttachments(mediaContext: FollowupMediaContext): boolean {
+  const attachments = normalizeAttachments(mediaContext as MsgContext);
+  return (
+    attachments.length > 0 &&
+    attachments.every((attachment) => {
+      const kind = resolveAttachmentKind(attachment);
+      return kind !== "audio" && kind !== "image" && kind !== "video";
+    })
+  );
+}
+
+function snapshotUpdatedMediaContext(params: {
+  original: FollowupMediaContext;
+  mediaCtx: MsgContext;
+  updatedBody?: string;
+}): FollowupMediaContext {
+  return {
+    ...params.original,
+    Body: params.updatedBody ?? params.original.Body,
+    Transcript:
+      typeof params.mediaCtx.Transcript === "string"
+        ? params.mediaCtx.Transcript
+        : params.original.Transcript,
+    MediaUnderstanding: Array.isArray(params.mediaCtx.MediaUnderstanding)
+      ? [...params.mediaCtx.MediaUnderstanding]
+      : params.original.MediaUnderstanding,
+    MediaUnderstandingDecisions: Array.isArray(params.mediaCtx.MediaUnderstandingDecisions)
+      ? [...params.mediaCtx.MediaUnderstandingDecisions]
+      : params.original.MediaUnderstandingDecisions,
+    DeferredMediaApplied: true,
+  };
+}
+
+export async function applyDeferredMediaUnderstandingToQueuedRun(
+  queued: FollowupRun,
+  params: { logLabel?: string } = {},
+): Promise<void> {
+  const mediaContext = queued.mediaContext;
+  if (!mediaContext || mediaContext.DeferredMediaApplied) {
+    return;
+  }
+  if (mediaContext.MediaUnderstanding?.length) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
+  }
+  if (!hasMediaAttachments(mediaContext)) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
+  }
+
+  const resolvedOriginalBody =
+    mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
+  const bodyAlreadyHasFileBlock =
+    FILE_BLOCK_RE.test(resolvedOriginalBody ?? "") || FILE_BLOCK_RE.test(mediaContext.Body ?? "");
+
+  if (bodyAlreadyHasFileBlock && hasOnlyFileLikeAttachments(mediaContext)) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
+  }
+
+  try {
+    const mediaCtx = {
+      ...mediaContext,
+      Body: resolvedOriginalBody,
+      Provider:
+        mediaContext.Provider ??
+        queued.run.messageProvider ??
+        (typeof mediaContext.OriginatingChannel === "string"
+          ? mediaContext.OriginatingChannel
+          : undefined),
+      Surface: mediaContext.Surface,
+    } as MsgContext;
+
+    const muResult = await applyMediaUnderstanding({
+      ctx: mediaCtx,
+      cfg: queued.run.config,
+      agentDir: queued.run.agentDir,
+      activeModel: {
+        provider: queued.run.provider,
+        model: queued.run.model,
+      },
+    });
+
+    const shouldRebuildPrompt =
+      muResult.outputs.length > 0 ||
+      muResult.appliedAudio ||
+      muResult.appliedImage ||
+      muResult.appliedVideo ||
+      (muResult.appliedFile && !bodyAlreadyHasFileBlock);
+
+    if (shouldRebuildPrompt) {
+      const newMediaNote = buildInboundMediaNote(mediaCtx);
+      queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
+        prompt: queued.prompt,
+        originalBody: resolvedOriginalBody,
+        updatedBody: mediaCtx.Body,
+        mediaNote: newMediaNote,
+      });
+      logVerbose(
+        `${params.logLabel ?? "followup"}: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
+      );
+    }
+
+    queued.mediaContext = snapshotUpdatedMediaContext({
+      original: mediaContext,
+      mediaCtx,
+      updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined,
+    });
+  } catch (err) {
+    logVerbose(
+      `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`,
+    );
+  }
+}
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index a8e31f998de..ea7448b74e5 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -3,7 +3,8 @@ import { tmpdir } from "node:os";
 import path from "node:path";
 import { beforeEach, describe, expect, it, vi } from "vitest";
 import { loadSessionStore, type SessionEntry, saveSessionStore } from "../../config/sessions.js";
-import type { FollowupRun } from "./queue.js";
+import { buildCollectPrompt } from "../../utils/queue-helpers.js";
+import type { FollowupRun } from "./queue/types.js";
 import { createMockFollowupRun, createMockTypingController } from "./test-helpers.js";
 
 const runEmbeddedPiAgentMock = vi.fn();
@@ -34,6 +35,10 @@ vi.mock("./route-reply.js", async (importOriginal) => {
 });
 
 import { createFollowupRunner } from "./followup-runner.js";
+import {
+  applyDeferredMediaToQueuedRuns,
+  buildMediaAwareQueueSummaryPrompt,
+} from "./queue/drain.js";
 
 const ROUTABLE_TEST_CHANNELS = new Set([
   "telegram",
@@ -757,6 +762,69 @@ describe("createFollowupRunner media understanding", () => {
     expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
   });
 
+  it("propagates the queued message provider into deferred media context", async () => {
+    const transcriptText = "Provider-aware transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        expect(params.ctx.Provider).toBe("telegram");
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "done" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[User sent media without caption]",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+  });
+
   it("applies media understanding for URL-only attachments", async () => {
     const transcriptText = "URL-only transcript";
     applyMediaUnderstandingMock.mockImplementationOnce(
@@ -1678,4 +1746,159 @@ describe("createFollowupRunner media understanding", () => {
     const matches = agentCall?.prompt?.match(/summarize this/g);
     expect(matches?.length).toBe(1);
   });
+
+  it("does not re-apply file extraction when the stored media body already has a file block", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${fileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt?.match(/<file\s+name="report\.pdf"/g)).toHaveLength(1);
+  });
+});
+
+describe("followup queue drain deferred media understanding", () => {
+  it("preprocesses collect batches before synthesizing the followup prompt", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: "collect transcript",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = "collect transcript";
+        params.ctx.Body = "[Audio]\nTranscript:\ncollect transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "collect transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    const items: FollowupRun[] = [
+      createQueuedRun({
+        prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text",
+        summaryLine: "some text",
+        originatingChannel: "telegram",
+        originatingTo: "chat:1",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+      createQueuedRun({
+        prompt: "second text",
+        summaryLine: "second text",
+        originatingChannel: "telegram",
+        originatingTo: "chat:1",
+        run: { messageProvider: "telegram" },
+      }),
+    ];
+
+    await applyDeferredMediaToQueuedRuns(items);
+
+    const prompt = buildCollectPrompt({
+      title: "[Queued messages while agent was busy]",
+      items,
+      renderItem: (item, idx) => `---\nQueued #${idx + 1}\n${item.prompt}`.trim(),
+    });
+
+    expect(prompt).toContain("collect transcript");
+    expect(prompt).toContain("Queued #2\nsecond text");
+    expect(prompt).not.toContain("[media attached: /tmp/voice.ogg");
+  });
+
+  it("preprocesses dropped media items before building overflow summaries", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: "overflow transcript",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = "overflow transcript";
+        params.ctx.Body = "[Audio]\nTranscript:\noverflow transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "overflow transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({
+      dropPolicy: "summarize",
+      droppedCount: 1,
+      summaryLines: ["[media attached: /tmp/voice.ogg (audio/ogg)]"],
+      summaryItems: [
+        createQueuedRun({
+          prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]",
+          summaryLine: "",
+          run: { messageProvider: "telegram" },
+          mediaContext: {
+            Body: "",
+            MediaPaths: ["/tmp/voice.ogg"],
+            MediaTypes: ["audio/ogg"],
+          },
+        }),
+      ],
+      noun: "message",
+    });
+
+    expect(summaryPrompt).toContain("[Queue overflow] Dropped 1 message due to cap.");
+    expect(summaryPrompt).toContain("overflow transcript");
+    expect(summaryPrompt).not.toContain("[media attached: /tmp/voice.ogg");
+  });
 });
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index d10d63d6389..50925cd1269 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -9,16 +9,14 @@ import type { SessionEntry } from "../../config/sessions.js";
 import type { TypingMode } from "../../config/types.js";
 import { logVerbose } from "../../globals.js";
 import { registerAgentRunContext } from "../../infra/agent-events.js";
-import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
 import { defaultRuntime } from "../../runtime.js";
 import { isInternalMessageChannel } from "../../utils/message-channel.js";
 import { stripHeartbeatToken } from "../heartbeat.js";
-import { buildInboundMediaNote } from "../media-note.js";
-import type { MsgContext, OriginatingChannelType } from "../templating.js";
+import type { OriginatingChannelType } from "../templating.js";
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveRunAuthProfile } from "./agent-runner-utils.js";
-import { parseInlineDirectives } from "./directive-handling.js";
+import { applyDeferredMediaUnderstandingToQueuedRun } from "./followup-media.js";
 import {
   resolveOriginAccountId,
   resolveOriginMessageProvider,
@@ -37,113 +35,6 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r
 import { createTypingSignaler } from "./typing-mode.js";
 import type { TypingController } from "./typing.js";
 
-const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
-const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
-const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
-const FILE_BLOCK_RE = /<file\s+name="/i;
-
-function stripLeadingMediaAttachedLines(prompt: string): string {
-  const lines = prompt.split("\n");
-  let index = 0;
-  while (index < lines.length) {
-    const trimmed = lines[index]?.trim() ?? "";
-    if (!LEADING_MEDIA_ATTACHED_LINE_RE.test(trimmed)) {
-      break;
-    }
-    index += 1;
-  }
-  return lines.slice(index).join("\n").trim();
-}
-
-function stripLeadingMediaReplyHint(prompt: string): string {
-  const lines = prompt.split("\n");
-  if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) {
-    return lines.slice(1).join("\n").trim();
-  }
-  return prompt.trim();
-}
-
-function replaceLastOccurrence(
-  value: string,
-  search: string,
-  replacement: string,
-): string | undefined {
-  if (!search) {
-    return undefined;
-  }
-  const index = value.lastIndexOf(search);
-  if (index < 0) {
-    return undefined;
-  }
-  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
-}
-
-function stripInlineDirectives(text: string | undefined): string {
-  return parseInlineDirectives(text ?? "").cleaned.trim();
-}
-
-function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
-  const updatedBody = params.updatedBody?.trim();
-  if (!updatedBody) {
-    return "";
-  }
-  const originalBody = params.originalBody?.trim();
-  if (!originalBody) {
-    return updatedBody;
-  }
-
-  const cleanedOriginalBody = stripInlineDirectives(originalBody);
-  if (!cleanedOriginalBody) {
-    return updatedBody;
-  }
-  if (updatedBody === originalBody) {
-    return cleanedOriginalBody;
-  }
-  return (
-    replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
-  ).trim();
-}
-
-function rebuildQueuedPromptWithMediaUnderstanding(params: {
-  prompt: string;
-  originalBody?: string;
-  updatedBody?: string;
-  mediaNote?: string;
-}): string {
-  let stripped = stripLeadingMediaAttachedLines(params.prompt);
-  if (!params.mediaNote) {
-    stripped = stripLeadingMediaReplyHint(stripped);
-  }
-
-  const updatedBody = normalizeUpdatedBody({
-    originalBody: params.originalBody,
-    updatedBody: params.updatedBody,
-  });
-  if (!updatedBody) {
-    return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
-  }
-
-  const replacementTargets = [
-    params.originalBody?.trim(),
-    stripInlineDirectives(params.originalBody),
-    MEDIA_ONLY_PLACEHOLDER,
-  ].filter(
-    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
-  );
-
-  let rebuilt = stripped;
-  for (const target of replacementTargets) {
-    const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
-    if (replaced !== undefined) {
-      rebuilt = replaced;
-      return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
-    }
-  }
-
-  rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
-  return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
-}
-
 export function createFollowupRunner(params: {
   opts?: GetReplyOptions;
   typing: TypingController;
@@ -264,72 +155,7 @@ export function createFollowupRunner(params: {
       let bootstrapPromptWarningSignaturesSeen = resolveBootstrapWarningSignaturesSeen(
         activeSessionEntry?.systemPromptReport,
       );
-
-      // Apply media understanding for followup-queued messages when it was
-      // not applied (or failed) in the primary path.  This ensures voice
-      // notes that arrived while the agent was mid-turn still get transcribed.
-      if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) {
-        const hasMedia = Boolean(
-          queued.mediaContext.MediaPath?.trim() ||
-          queued.mediaContext.MediaUrl?.trim() ||
-          (Array.isArray(queued.mediaContext.MediaPaths) &&
-            queued.mediaContext.MediaPaths.length > 0) ||
-          (Array.isArray(queued.mediaContext.MediaUrls) &&
-            queued.mediaContext.MediaUrls.length > 0),
-        );
-        if (hasMedia) {
-          try {
-            const resolvedOriginalBody =
-              queued.mediaContext.CommandBody ??
-              queued.mediaContext.RawBody ??
-              queued.mediaContext.Body;
-            const mediaCtx = {
-              ...queued.mediaContext,
-              Body: resolvedOriginalBody,
-            } as MsgContext;
-            const originalBody = resolvedOriginalBody;
-            // Capture whether the resolved body already contains a file block
-            // BEFORE applyMediaUnderstanding mutates it — this detects prior
-            // extraction so we avoid double-inserting.  Checking the body
-            // (not the full queued.prompt) avoids false positives from user
-            // messages that happen to contain literal "<file path=" text.
-            const bodyAlreadyHasFileBlock = FILE_BLOCK_RE.test(resolvedOriginalBody ?? "");
-            const muResult = await applyMediaUnderstanding({
-              ctx: mediaCtx,
-              cfg: queued.run.config,
-              agentDir: queued.run.agentDir,
-              activeModel: {
-                provider: queued.run.provider,
-                model: queued.run.model,
-              },
-            });
-            const shouldRebuildPrompt =
-              muResult.outputs.length > 0 ||
-              muResult.appliedAudio ||
-              muResult.appliedImage ||
-              muResult.appliedVideo ||
-              (muResult.appliedFile && !bodyAlreadyHasFileBlock);
-            if (shouldRebuildPrompt) {
-              // Rebuild the queued prompt from the mutated media context so the
-              // deferred path matches the primary path's prompt shape.
-              const newMediaNote = buildInboundMediaNote(mediaCtx);
-              queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
-                prompt: queued.prompt,
-                originalBody,
-                updatedBody: mediaCtx.Body,
-                mediaNote: newMediaNote,
-              });
-              logVerbose(
-                `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
-              );
-            }
-          } catch (err) {
-            logVerbose(
-              `followup: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`,
-            );
-          }
-        }
-      }
+      await applyDeferredMediaUnderstandingToQueuedRun(queued, { logLabel: "followup" });
 
       try {
         const fallbackResult = await runWithModelFallback({
diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts
index 32999a66238..92ca1be7513 100644
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -490,6 +490,8 @@ export async function runPreparedReply(
         Body: ctx.Body,
         CommandBody: ctx.CommandBody,
         RawBody: ctx.RawBody,
+        Provider: ctx.Provider ?? sessionCtx.Provider,
+        Surface: ctx.Surface ?? sessionCtx.Surface,
         MediaPath: ctx.MediaPath,
         MediaUrl: ctx.MediaUrl,
         MediaType: ctx.MediaType,
diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 1e2fb33e4e0..68c660fe2b8 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -3,15 +3,17 @@ import { resolveGlobalMap } from "../../../shared/global-singleton.js";
 import {
   buildCollectPrompt,
   beginQueueDrain,
+  buildQueueSummaryLine,
+  buildQueueSummaryPrompt,
   clearQueueSummaryState,
   drainCollectQueueStep,
   drainNextQueueItem,
   hasCrossChannelItems,
-  previewQueueSummaryPrompt,
   waitForQueueDebounce,
 } from "../../../utils/queue-helpers.js";
+import { applyDeferredMediaUnderstandingToQueuedRun } from "../followup-media.js";
 import { isRoutableChannel } from "../route-reply.js";
-import { FOLLOWUP_QUEUES } from "./state.js";
+import { FOLLOWUP_QUEUES, type FollowupQueueState } from "./state.js";
 import type { FollowupRun } from "./types.js";
 
 // Persists the most recent runFollowup callback per queue key so that
@@ -68,6 +70,50 @@ function resolveCrossChannelKey(item: FollowupRun): { cross?: true; key?: string
   };
 }
 
+function clearFollowupQueueSummaryState(queue: FollowupQueueState): void {
+  clearQueueSummaryState(queue);
+  queue.summaryItems = [];
+}
+
+export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise<void> {
+  for (const item of items) {
+    await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
+  }
+}
+
+async function resolveSummaryLines(items: FollowupRun[]): Promise<string[]> {
+  const summaryLines: string[] = [];
+  for (const item of items) {
+    await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
+    summaryLines.push(buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim()));
+  }
+  return summaryLines;
+}
+
+export async function buildMediaAwareQueueSummaryPrompt(params: {
+  dropPolicy: FollowupQueueState["dropPolicy"];
+  droppedCount: number;
+  summaryLines: string[];
+  summaryItems: FollowupRun[];
+  noun: string;
+}): Promise<string | undefined> {
+  if (params.dropPolicy !== "summarize" || params.droppedCount <= 0) {
+    return undefined;
+  }
+  const summaryLines =
+    params.summaryItems.length > 0
+      ? await resolveSummaryLines(params.summaryItems)
+      : params.summaryLines;
+  return buildQueueSummaryPrompt({
+    state: {
+      dropPolicy: params.dropPolicy,
+      droppedCount: params.droppedCount,
+      summaryLines: [...summaryLines],
+    },
+    noun: params.noun,
+  });
+}
+
 export function scheduleFollowupDrain(
   key: string,
   runFollowup: (run: FollowupRun) => Promise<void>,
@@ -107,7 +153,14 @@ export function scheduleFollowupDrain(
           }
 
           const items = queue.items.slice();
-          const summary = previewQueueSummaryPrompt({ state: queue, noun: "message" });
+          await applyDeferredMediaToQueuedRuns(items);
+          const summary = await buildMediaAwareQueueSummaryPrompt({
+            dropPolicy: queue.dropPolicy,
+            droppedCount: queue.droppedCount,
+            summaryLines: queue.summaryLines,
+            summaryItems: queue.summaryItems,
+            noun: "message",
+          });
           const run = items.at(-1)?.run ?? queue.lastRun;
           if (!run) {
             break;
@@ -129,12 +182,18 @@ export function scheduleFollowupDrain(
           });
           queue.items.splice(0, items.length);
           if (summary) {
-            clearQueueSummaryState(queue);
+            clearFollowupQueueSummaryState(queue);
           }
           continue;
         }
 
-        const summaryPrompt = previewQueueSummaryPrompt({ state: queue, noun: "message" });
+        const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({
+          dropPolicy: queue.dropPolicy,
+          droppedCount: queue.droppedCount,
+          summaryLines: queue.summaryLines,
+          summaryItems: queue.summaryItems,
+          noun: "message",
+        });
         if (summaryPrompt) {
           const run = queue.lastRun;
           if (!run) {
@@ -155,7 +214,7 @@ export function scheduleFollowupDrain(
           ) {
             break;
           }
-          clearQueueSummaryState(queue);
+          clearFollowupQueueSummaryState(queue);
           continue;
         }
 
diff --git a/src/auto-reply/reply/queue/enqueue.ts b/src/auto-reply/reply/queue/enqueue.ts
index 11da0db98fc..e58cc5ffac5 100644
--- a/src/auto-reply/reply/queue/enqueue.ts
+++ b/src/auto-reply/reply/queue/enqueue.ts
@@ -1,8 +1,8 @@
 import { createDedupeCache } from "../../../infra/dedupe.js";
 import { resolveGlobalSingleton } from "../../../shared/global-singleton.js";
-import { applyQueueDropPolicy, shouldSkipQueueItem } from "../../../utils/queue-helpers.js";
+import { buildQueueSummaryLine, shouldSkipQueueItem } from "../../../utils/queue-helpers.js";
 import { kickFollowupDrainIfIdle } from "./drain.js";
-import { getExistingFollowupQueue, getFollowupQueue } from "./state.js";
+import { getExistingFollowupQueue, getFollowupQueue, type FollowupQueueState } from "./state.js";
 import type { FollowupRun, QueueDedupeMode, QueueSettings } from "./types.js";
 
 /**
@@ -57,6 +57,34 @@ function isRunAlreadyQueued(
   return items.some((item) => item.prompt === run.prompt && hasSameRouting(item));
 }
 
+function applyFollowupQueueDropPolicy(queue: FollowupQueueState): boolean {
+  const cap = queue.cap;
+  if (cap <= 0 || queue.items.length < cap) {
+    return true;
+  }
+  if (queue.dropPolicy === "new") {
+    return false;
+  }
+
+  const dropCount = queue.items.length - cap + 1;
+  const dropped = queue.items.splice(0, dropCount);
+  if (queue.dropPolicy === "summarize") {
+    for (const item of dropped) {
+      queue.droppedCount += 1;
+      queue.summaryItems.push(item);
+      queue.summaryLines.push(
+        buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim()),
+      );
+    }
+    const limit = Math.max(0, cap);
+    while (queue.summaryLines.length > limit) {
+      queue.summaryLines.shift();
+      queue.summaryItems.shift();
+    }
+  }
+  return true;
+}
+
 export function enqueueFollowupRun(
   key: string,
   run: FollowupRun,
@@ -83,10 +111,7 @@ export function enqueueFollowupRun(
   queue.lastEnqueuedAt = Date.now();
   queue.lastRun = run.run;
 
-  const shouldEnqueue = applyQueueDropPolicy({
-    queue,
-    summarize: (item) => item.summaryLine?.trim() || item.prompt.trim(),
-  });
+  const shouldEnqueue = applyFollowupQueueDropPolicy(queue);
   if (!shouldEnqueue) {
     return false;
   }
diff --git a/src/auto-reply/reply/queue/state.ts b/src/auto-reply/reply/queue/state.ts
index 44208e727dd..94021dd0c4c 100644
--- a/src/auto-reply/reply/queue/state.ts
+++ b/src/auto-reply/reply/queue/state.ts
@@ -4,6 +4,7 @@ import type { FollowupRun, QueueDropPolicy, QueueMode, QueueSettings } from "./t
 
 export type FollowupQueueState = {
   items: FollowupRun[];
+  summaryItems: FollowupRun[];
   draining: boolean;
   lastEnqueuedAt: number;
   mode: QueueMode;
@@ -47,6 +48,7 @@ export function getFollowupQueue(key: string, settings: QueueSettings): Followup
 
   const created: FollowupQueueState = {
     items: [],
+    summaryItems: [],
     draining: false,
     lastEnqueuedAt: 0,
     mode: settings.mode,
@@ -78,6 +80,7 @@ export function clearFollowupQueue(key: string): number {
   }
   const cleared = queue.items.length + queue.droppedCount;
   queue.items.length = 0;
+  queue.summaryItems.length = 0;
   queue.droppedCount = 0;
   queue.summaryLines = [];
   queue.lastRun = undefined;
diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts
index 637ceb0a149..291059a28d7 100644
--- a/src/auto-reply/reply/queue/types.ts
+++ b/src/auto-reply/reply/queue/types.ts
@@ -32,6 +32,8 @@ export type FollowupMediaContext = {
   Body?: string;
   CommandBody?: string;
   RawBody?: string;
+  Provider?: string;
+  Surface?: string;
   MediaPath?: string;
   MediaUrl?: string;
   MediaType?: string;
@@ -47,6 +49,7 @@ export type FollowupMediaContext = {
   OriginatingTo?: string;
   AccountId?: string;
   MessageThreadId?: string | number;
+  DeferredMediaApplied?: boolean;
 };
 
 export type FollowupRun = {

From 48b22eecd0dd7fb50bba83ced450981d16db253f Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 23:03:20 -0400
Subject: [PATCH 07/16] fix: set DeferredMediaApplied on error and strip old
 file blocks on rebuild

---
 src/auto-reply/reply/followup-media.ts       |  15 +++
 src/auto-reply/reply/followup-runner.test.ts | 111 +++++++++++++++++++
 2 files changed, 126 insertions(+)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index 5a014e63f9b..5340d0df99a 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -13,6 +13,11 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
 const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
 const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
 const FILE_BLOCK_RE = /<file\s+name="/i;
+const FILE_BLOCK_FULL_RE = /<file\s+name="[^"]*"[^>]*>[\s\S]*?<\/file>\n?/gi;
+
+function stripExistingFileBlocks(text: string): string {
+  return text.replace(FILE_BLOCK_FULL_RE, "").trim();
+}
 
 function stripLeadingMediaAttachedLines(prompt: string): string {
   const lines = prompt.split("\n");
@@ -87,6 +92,15 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     stripped = stripLeadingMediaReplyHint(stripped);
   }
 
+  // Strip pre-existing file blocks from the prompt when the updated body
+  // contains new file blocks.  Mixed messages (audio + PDF) can arrive with
+  // file extraction already applied in the primary path; without this strip
+  // the old block stays in the prompt while the updated body adds a new one,
+  // duplicating potentially large file payloads.
+  if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) {
+    stripped = stripExistingFileBlocks(stripped);
+  }
+
   const updatedBody = normalizeUpdatedBody({
     originalBody: params.originalBody,
     updatedBody: params.updatedBody,
@@ -234,6 +248,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
       updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined,
     });
   } catch (err) {
+    mediaContext.DeferredMediaApplied = true;
     logVerbose(
       `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`,
     );
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index ea7448b74e5..3ef621c4a4b 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1747,6 +1747,117 @@ describe("createFollowupRunner media understanding", () => {
     expect(matches?.length).toBe(1);
   });
 
+  it("does not duplicate file blocks for mixed audio+file messages re-processed in followup", async () => {
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';
+    const newFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nnew extracted content\n</file>';
+    const transcriptText = "Mixed message transcript";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nanalyze this\n\n${newFileBlock}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // Simulate a mixed message where the primary path already extracted the
+    // PDF (file block is in the prompt) but audio transcription failed.
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nanalyze this\n\n${existingFileBlock}`,
+        mediaContext: {
+          Body: `analyze this\n\n${existingFileBlock}`,
+          CommandBody: "analyze this",
+          RawBody: "analyze this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // Should contain the transcript
+    expect(agentCall?.prompt).toContain(transcriptText);
+    // Should have exactly one file block (the new one), not two
+    expect(agentCall?.prompt?.match(/<file\s+name="report\.pdf"/g)).toHaveLength(1);
+    expect(agentCall?.prompt).toContain("new extracted content");
+    expect(agentCall?.prompt).not.toContain("old extracted content");
+  });
+
+  it("sets DeferredMediaApplied when media understanding throws", async () => {
+    applyMediaUnderstandingMock.mockRejectedValueOnce(
+      new Error("transcription service unavailable"),
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "fallback reply" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+      },
+    });
+
+    await runner(queued);
+
+    // DeferredMediaApplied should be set so re-runs don't retry
+    expect(queued.mediaContext?.DeferredMediaApplied).toBe(true);
+
+    // The agent should still be called with the raw prompt
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("some text");
+  });
+
   it("does not re-apply file extraction when the stored media body already has a file block", async () => {
     const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
     runEmbeddedPiAgentMock.mockResolvedValueOnce({

From 88ac4bb23800efc09e492b0400dfd787773fed37 Mon Sep 17 00:00:00 2001
From: Joseph Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 23:26:27 -0400
Subject: [PATCH 08/16] Update src/auto-reply/reply/followup-runner.test.ts

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 src/auto-reply/reply/followup-runner.test.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 3ef621c4a4b..bae9fa98391 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1589,7 +1589,7 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example.");
   });
 
-  it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
+  it("rebuilds the prompt when image understanding mutates the body alongside existing text", async () => {
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
         params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items.";

From 4c56672e974c271318d4a245c2edefcb3bfed362 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 23:51:22 -0400
Subject: [PATCH 09/16] fix: address remaining review feedback on followup
 media

- Scope file-block stripping to body region only, preserving file blocks
  in quoted/replied text and thread history above the body
- Gate file-extraction skip on mutation evidence (Body differs from
  resolved original) instead of raw '<file name=' text matching to avoid
  false-positives on user messages containing literal XML
- Document collect-mode scope limitation in applyDeferredMediaUnderstandingToQueuedRun
- Rename duplicate test description to distinguish body-alongside-text case
- Prefer updated prompt over original summaryLine in overflow summaries
  so captioned voice note transcripts are surfaced
---
 src/auto-reply/reply/followup-media.ts | 26 ++++++++++++++++++++------
 src/auto-reply/reply/queue/drain.ts    |  4 +++-
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index 5340d0df99a..61d3b9f9cfb 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -92,13 +92,21 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     stripped = stripLeadingMediaReplyHint(stripped);
   }
 
-  // Strip pre-existing file blocks from the prompt when the updated body
+  // Strip pre-existing file blocks from the body region when the updated body
   // contains new file blocks.  Mixed messages (audio + PDF) can arrive with
   // file extraction already applied in the primary path; without this strip
   // the old block stays in the prompt while the updated body adds a new one,
   // duplicating potentially large file payloads.
+  // Scope stripping to the body segment so quoted/replied text and thread
+  // history above the body retain any legitimate <file> blocks.
   if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) {
-    stripped = stripExistingFileBlocks(stripped);
+    const bodyTarget = params.originalBody?.trim();
+    const bodyIdx = bodyTarget ? stripped.lastIndexOf(bodyTarget) : -1;
+    if (bodyIdx >= 0) {
+      stripped = stripped.slice(0, bodyIdx) + stripExistingFileBlocks(stripped.slice(bodyIdx));
+    } else {
+      stripped = stripExistingFileBlocks(stripped);
+    }
   }
 
   const updatedBody = normalizeUpdatedBody({
@@ -176,6 +184,9 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
   queued: FollowupRun,
   params: { logLabel?: string } = {},
 ): Promise<void> {
+  // NOTE: collect-mode and overflow-summary queue drains create synthetic
+  // followup runs without mediaContext — those paths are not covered here
+  // and rely on their own prompt-building logic in queue/drain.ts.
   const mediaContext = queued.mediaContext;
   if (!mediaContext || mediaContext.DeferredMediaApplied) {
     return;
@@ -191,10 +202,13 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
 
   const resolvedOriginalBody =
     mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
-  const bodyAlreadyHasFileBlock =
-    FILE_BLOCK_RE.test(resolvedOriginalBody ?? "") || FILE_BLOCK_RE.test(mediaContext.Body ?? "");
+  // Detect file extraction from the primary path by checking whether Body was
+  // mutated AND contains file blocks.  This avoids false-positives when a user
+  // message literally contains "<file name=" text.
+  const fileExtractionAlreadyApplied =
+    mediaContext.Body !== resolvedOriginalBody && FILE_BLOCK_RE.test(mediaContext.Body ?? "");
 
-  if (bodyAlreadyHasFileBlock && hasOnlyFileLikeAttachments(mediaContext)) {
+  if (fileExtractionAlreadyApplied && hasOnlyFileLikeAttachments(mediaContext)) {
     mediaContext.DeferredMediaApplied = true;
     return;
   }
@@ -227,7 +241,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
       muResult.appliedAudio ||
       muResult.appliedImage ||
       muResult.appliedVideo ||
-      (muResult.appliedFile && !bodyAlreadyHasFileBlock);
+      (muResult.appliedFile && !fileExtractionAlreadyApplied);
 
     if (shouldRebuildPrompt) {
       const newMediaNote = buildInboundMediaNote(mediaCtx);
diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 68c660fe2b8..471c94200ba 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -85,7 +85,9 @@ async function resolveSummaryLines(items: FollowupRun[]): Promise<string[]> {
   const summaryLines: string[] = [];
   for (const item of items) {
     await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
-    summaryLines.push(buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim()));
+    // After deferred media, prefer the updated prompt (which includes transcripts)
+    // over the original summaryLine (which may just be the caption text).
+    summaryLines.push(buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || ""));
   }
   return summaryLines;
 }

From 8b6dbfb7a1151b49b283f205f8e0a7663d04dabd Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sat, 14 Mar 2026 23:52:08 -0400
Subject: [PATCH 10/16] chore: format python scripts

---
 ...ck-composite-action-input-interpolation.py |  1 -
 skills/model-usage/scripts/model_usage.py     | 52 ++++++++--------
 .../nano-banana-pro/scripts/generate_image.py | 59 ++++++++-----------
 skills/openai-image-gen/scripts/gen.py        | 37 ++++++------
 skills/openai-image-gen/scripts/test_gen.py   |  2 +
 .../skill-creator/scripts/quick_validate.py   | 18 ++----
 6 files changed, 75 insertions(+), 94 deletions(-)

diff --git a/scripts/check-composite-action-input-interpolation.py b/scripts/check-composite-action-input-interpolation.py
index 18c52501b85..5c852a7f323 100644
--- a/scripts/check-composite-action-input-interpolation.py
+++ b/scripts/check-composite-action-input-interpolation.py
@@ -5,7 +5,6 @@ import pathlib
 import re
 import sys
 
-
 INPUT_INTERPOLATION_RE = re.compile(r"\$\{\{\s*inputs\.")
 RUN_LINE_RE = re.compile(r"^(\s*)run:\s*(.*)$")
 USING_COMPOSITE_RE = re.compile(r"^\s*using:\s*composite\s*$", re.MULTILINE)
diff --git a/skills/model-usage/scripts/model_usage.py b/skills/model-usage/scripts/model_usage.py
index ea28fa0d983..7688e4e46b2 100644
--- a/skills/model-usage/scripts/model_usage.py
+++ b/skills/model-usage/scripts/model_usage.py
@@ -9,12 +9,12 @@ from __future__ import annotations
 
 import argparse
 import json
-import os
 import subprocess
 import sys
+from collections.abc import Iterable
 from dataclasses import dataclass
 from datetime import date, datetime, timedelta
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any
 
 
 def positive_int(value: str) -> int:
@@ -31,7 +31,7 @@ def eprint(msg: str) -> None:
     print(msg, file=sys.stderr)
 
 
-def run_codexbar_cost(provider: str) -> List[Dict[str, Any]]:
+def run_codexbar_cost(provider: str) -> list[dict[str, Any]]:
     cmd = ["codexbar", "cost", "--format", "json", "--provider", provider]
     try:
         output = subprocess.check_output(cmd, text=True)
@@ -48,12 +48,12 @@ def run_codexbar_cost(provider: str) -> List[Dict[str, Any]]:
     return payload
 
 
-def load_payload(input_path: Optional[str], provider: str) -> Dict[str, Any]:
+def load_payload(input_path: str | None, provider: str) -> dict[str, Any]:
     if input_path:
         if input_path == "-":
             raw = sys.stdin.read()
         else:
-            with open(input_path, "r", encoding="utf-8") as handle:
+            with open(input_path, encoding="utf-8") as handle:
                 raw = handle.read()
         data = json.loads(raw)
     else:
@@ -77,7 +77,7 @@ class ModelCost:
     cost: float
 
 
-def parse_daily_entries(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
+def parse_daily_entries(payload: dict[str, Any]) -> list[dict[str, Any]]:
     daily = payload.get("daily")
     if not daily:
         return []
@@ -86,18 +86,18 @@ def parse_daily_entries(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
     return [entry for entry in daily if isinstance(entry, dict)]
 
 
-def parse_date(value: str) -> Optional[date]:
+def parse_date(value: str) -> date | None:
     try:
         return datetime.strptime(value, "%Y-%m-%d").date()
     except Exception:
         return None
 
 
-def filter_by_days(entries: List[Dict[str, Any]], days: Optional[int]) -> List[Dict[str, Any]]:
+def filter_by_days(entries: list[dict[str, Any]], days: int | None) -> list[dict[str, Any]]:
     if not days:
         return entries
     cutoff = date.today() - timedelta(days=days - 1)
-    filtered: List[Dict[str, Any]] = []
+    filtered: list[dict[str, Any]] = []
     for entry in entries:
         day = entry.get("date")
         if not isinstance(day, str):
@@ -108,8 +108,8 @@ def filter_by_days(entries: List[Dict[str, Any]], days: Optional[int]) -> List[D
     return filtered
 
 
-def aggregate_costs(entries: Iterable[Dict[str, Any]]) -> Dict[str, float]:
-    totals: Dict[str, float] = {}
+def aggregate_costs(entries: Iterable[dict[str, Any]]) -> dict[str, float]:
+    totals: dict[str, float] = {}
     for entry in entries:
         breakdowns = entry.get("modelBreakdowns")
         if not breakdowns:
@@ -129,7 +129,7 @@ def aggregate_costs(entries: Iterable[Dict[str, Any]]) -> Dict[str, float]:
     return totals
 
 
-def pick_current_model(entries: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
+def pick_current_model(entries: list[dict[str, Any]]) -> tuple[str | None, str | None]:
     if not entries:
         return None, None
     sorted_entries = sorted(
@@ -139,7 +139,7 @@ def pick_current_model(entries: List[Dict[str, Any]]) -> Tuple[Optional[str], Op
     for entry in reversed(sorted_entries):
         breakdowns = entry.get("modelBreakdowns")
         if isinstance(breakdowns, list) and breakdowns:
-            scored: List[ModelCost] = []
+            scored: list[ModelCost] = []
             for item in breakdowns:
                 if not isinstance(item, dict):
                     continue
@@ -158,13 +158,13 @@ def pick_current_model(entries: List[Dict[str, Any]]) -> Tuple[Optional[str], Op
     return None, None
 
 
-def usd(value: Optional[float]) -> str:
+def usd(value: float | None) -> str:
     if value is None:
         return "—"
     return f"${value:,.2f}"
 
 
-def latest_day_cost(entries: List[Dict[str, Any]], model: str) -> Tuple[Optional[str], Optional[float]]:
+def latest_day_cost(entries: list[dict[str, Any]], model: str) -> tuple[str | None, float | None]:
     if not entries:
         return None, None
     sorted_entries = sorted(
@@ -188,10 +188,10 @@ def latest_day_cost(entries: List[Dict[str, Any]], model: str) -> Tuple[Optional
 def render_text_current(
     provider: str,
     model: str,
-    latest_date: Optional[str],
-    total_cost: Optional[float],
-    latest_cost: Optional[float],
-    latest_cost_date: Optional[str],
+    latest_date: str | None,
+    total_cost: float | None,
+    latest_cost: float | None,
+    latest_cost_date: str | None,
     entry_count: int,
 ) -> str:
     lines = [f"Provider: {provider}", f"Current model: {model}"]
@@ -204,7 +204,7 @@ def render_text_current(
     return "\n".join(lines)
 
 
-def render_text_all(provider: str, totals: Dict[str, float]) -> str:
+def render_text_all(provider: str, totals: dict[str, float]) -> str:
     lines = [f"Provider: {provider}", "Models:"]
     for model, cost in sorted(totals.items(), key=lambda item: item[1], reverse=True):
         lines.append(f"- {model}: {usd(cost)}")
@@ -214,12 +214,12 @@ def render_text_all(provider: str, totals: Dict[str, float]) -> str:
 def build_json_current(
     provider: str,
     model: str,
-    latest_date: Optional[str],
-    total_cost: Optional[float],
-    latest_cost: Optional[float],
-    latest_cost_date: Optional[str],
+    latest_date: str | None,
+    total_cost: float | None,
+    latest_cost: float | None,
+    latest_cost_date: str | None,
     entry_count: int,
-) -> Dict[str, Any]:
+) -> dict[str, Any]:
     return {
         "provider": provider,
         "mode": "current",
@@ -232,7 +232,7 @@ def build_json_current(
     }
 
 
-def build_json_all(provider: str, totals: Dict[str, float]) -> Dict[str, Any]:
+def build_json_all(provider: str, totals: dict[str, float]) -> dict[str, Any]:
     return {
         "provider": provider,
         "mode": "all",
diff --git a/skills/nano-banana-pro/scripts/generate_image.py b/skills/nano-banana-pro/scripts/generate_image.py
index 796022adfba..92efabc2b0f 100755
--- a/skills/nano-banana-pro/scripts/generate_image.py
+++ b/skills/nano-banana-pro/scripts/generate_image.py
@@ -70,42 +70,32 @@ def choose_output_resolution(
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="Generate images using Nano Banana Pro (Gemini 3 Pro Image)"
-    )
+    parser = argparse.ArgumentParser(description="Generate images using Nano Banana Pro (Gemini 3 Pro Image)")
+    parser.add_argument("--prompt", "-p", required=True, help="Image description/prompt")
+    parser.add_argument("--filename", "-f", required=True, help="Output filename (e.g., sunset-mountains.png)")
     parser.add_argument(
-        "--prompt", "-p",
-        required=True,
-        help="Image description/prompt"
-    )
-    parser.add_argument(
-        "--filename", "-f",
-        required=True,
-        help="Output filename (e.g., sunset-mountains.png)"
-    )
-    parser.add_argument(
-        "--input-image", "-i",
+        "--input-image",
+        "-i",
         action="append",
         dest="input_images",
         metavar="IMAGE",
-        help="Input image path(s) for editing/composition. Can be specified multiple times (up to 14 images)."
+        help="Input image path(s) for editing/composition. Can be specified multiple times (up to 14 images).",
     )
     parser.add_argument(
-        "--resolution", "-r",
+        "--resolution",
+        "-r",
         choices=["1K", "2K", "4K"],
         default=None,
-        help="Output resolution: 1K, 2K, or 4K. If omitted with input images, auto-detect from largest image dimension."
+        help="Output resolution: 1K, 2K, or 4K. If omitted with input images, auto-detect from largest image dimension.",
     )
     parser.add_argument(
-        "--aspect-ratio", "-a",
+        "--aspect-ratio",
+        "-a",
         choices=SUPPORTED_ASPECT_RATIOS,
         default=None,
-        help=f"Output aspect ratio (default: model decides). Options: {', '.join(SUPPORTED_ASPECT_RATIOS)}"
-    )
-    parser.add_argument(
-        "--api-key", "-k",
-        help="Gemini API key (overrides GEMINI_API_KEY env var)"
+        help=f"Output aspect ratio (default: model decides). Options: {', '.join(SUPPORTED_ASPECT_RATIOS)}",
     )
+    parser.add_argument("--api-key", "-k", help="Gemini API key (overrides GEMINI_API_KEY env var)")
 
     args = parser.parse_args()
 
@@ -158,10 +148,7 @@ def main():
         has_input_images=bool(input_images),
     )
     if auto_detected:
-        print(
-            f"Auto-detected resolution: {output_resolution} "
-            f"(from max input dimension {max_input_dim})"
-        )
+        print(f"Auto-detected resolution: {output_resolution} (from max input dimension {max_input_dim})")
 
     # Build contents (images first if editing, prompt only if generating)
     if input_images:
@@ -182,9 +169,8 @@ def main():
             model="gemini-3-pro-image-preview",
             contents=contents,
             config=types.GenerateContentConfig(
-                response_modalities=["TEXT", "IMAGE"],
-                image_config=types.ImageConfig(**image_cfg_kwargs)
-            )
+                response_modalities=["TEXT", "IMAGE"], image_config=types.ImageConfig(**image_cfg_kwargs)
+            ),
         )
 
         # Process response and convert to PNG
@@ -201,19 +187,20 @@ def main():
                 if isinstance(image_data, str):
                     # If it's a string, it might be base64
                     import base64
+
                     image_data = base64.b64decode(image_data)
 
                 image = PILImage.open(BytesIO(image_data))
 
                 # Ensure RGB mode for PNG (convert RGBA to RGB with white background if needed)
-                if image.mode == 'RGBA':
-                    rgb_image = PILImage.new('RGB', image.size, (255, 255, 255))
+                if image.mode == "RGBA":
+                    rgb_image = PILImage.new("RGB", image.size, (255, 255, 255))
                     rgb_image.paste(image, mask=image.split()[3])
-                    rgb_image.save(str(output_path), 'PNG')
-                elif image.mode == 'RGB':
-                    image.save(str(output_path), 'PNG')
+                    rgb_image.save(str(output_path), "PNG")
+                elif image.mode == "RGB":
+                    image.save(str(output_path), "PNG")
                 else:
-                    image.convert('RGB').save(str(output_path), 'PNG')
+                    image.convert("RGB").save(str(output_path), "PNG")
                 image_saved = True
 
         if image_saved:
diff --git a/skills/openai-image-gen/scripts/gen.py b/skills/openai-image-gen/scripts/gen.py
index 2d8c7569016..1fd61f0f7a3 100644
--- a/skills/openai-image-gen/scripts/gen.py
+++ b/skills/openai-image-gen/scripts/gen.py
@@ -58,9 +58,7 @@ def pick_prompts(count: int) -> list[str]:
     ]
     prompts: list[str] = []
     for _ in range(count):
-        prompts.append(
-            f"{random.choice(styles)} of {random.choice(subjects)}, {random.choice(lighting)}"
-        )
+        prompts.append(f"{random.choice(styles)} of {random.choice(subjects)}, {random.choice(lighting)}")
     return prompts
 
 
@@ -100,9 +98,7 @@ def normalize_optional_flag(
         value = aliases.get(value, value)
 
     if value not in allowed:
-        raise ValueError(
-            f"Invalid --{flag_name} '{raw_value}'. Allowed values: {allowed_text}."
-        )
+        raise ValueError(f"Invalid --{flag_name} '{raw_value}'. Allowed values: {allowed_text}.")
     return value
 
 
@@ -115,10 +111,7 @@ def normalize_background(model: str, background: str) -> str:
         supported=lambda candidate: candidate.startswith("gpt-image"),
         allowed={"transparent", "opaque", "auto"},
         allowed_text="transparent, opaque, auto",
-        unsupported_message=(
-            "Warning: --background is only supported for gpt-image models; "
-            "ignoring for '{model}'."
-        ),
+        unsupported_message=("Warning: --background is only supported for gpt-image models; ignoring for '{model}'."),
     )
 
 
@@ -131,9 +124,7 @@ def normalize_style(model: str, style: str) -> str:
         supported=lambda candidate: candidate == "dall-e-3",
         allowed={"vivid", "natural"},
         allowed_text="vivid, natural",
-        unsupported_message=(
-            "Warning: --style is only supported for dall-e-3; ignoring for '{model}'."
-        ),
+        unsupported_message=("Warning: --style is only supported for dall-e-3; ignoring for '{model}'."),
     )
 
 
@@ -147,8 +138,7 @@ def normalize_output_format(model: str, output_format: str) -> str:
         allowed={"png", "jpeg", "webp"},
         allowed_text="png, jpeg, webp",
         unsupported_message=(
-            "Warning: --output-format is only supported for gpt-image models; "
-            "ignoring for '{model}'."
+            "Warning: --output-format is only supported for gpt-image models; ignoring for '{model}'."
         ),
         aliases={"jpg": "jpeg"},
     )
@@ -245,9 +235,15 @@ def main() -> int:
     ap.add_argument("--prompt", help="Single prompt. If omitted, random prompts are generated.")
     ap.add_argument("--count", type=int, default=8, help="How many images to generate.")
     ap.add_argument("--model", default="gpt-image-1", help="Image model id.")
-    ap.add_argument("--size", default="", help="Image size (e.g. 1024x1024, 1536x1024). Defaults based on model if not specified.")
-    ap.add_argument("--quality", default="", help="Image quality (e.g. high, standard). Defaults based on model if not specified.")
-    ap.add_argument("--background", default="", help="Background transparency (GPT models only): transparent, opaque, or auto.")
+    ap.add_argument(
+        "--size", default="", help="Image size (e.g. 1024x1024, 1536x1024). Defaults based on model if not specified."
+    )
+    ap.add_argument(
+        "--quality", default="", help="Image quality (e.g. high, standard). Defaults based on model if not specified."
+    )
+    ap.add_argument(
+        "--background", default="", help="Background transparency (GPT models only): transparent, opaque, or auto."
+    )
     ap.add_argument("--output-format", default="", help="Output format (GPT models only): png, jpeg, or webp.")
     ap.add_argument("--style", default="", help="Image style (dall-e-3 only): vivid or natural.")
     ap.add_argument("--out-dir", default="", help="Output directory (default: ./tmp/openai-image-gen-<ts>).")
@@ -265,7 +261,10 @@ def main() -> int:
 
     count = args.count
     if args.model == "dall-e-3" and count > 1:
-        print(f"Warning: dall-e-3 only supports generating 1 image at a time. Reducing count from {count} to 1.", file=sys.stderr)
+        print(
+            f"Warning: dall-e-3 only supports generating 1 image at a time. Reducing count from {count} to 1.",
+            file=sys.stderr,
+        )
         count = 1
 
     out_dir = Path(args.out_dir).expanduser() if args.out_dir else default_out_dir()
diff --git a/skills/openai-image-gen/scripts/test_gen.py b/skills/openai-image-gen/scripts/test_gen.py
index 76445c0bb78..79d418c3e4c 100644
--- a/skills/openai-image-gen/scripts/test_gen.py
+++ b/skills/openai-image-gen/scripts/test_gen.py
@@ -82,6 +82,8 @@ def test_normalize_output_format_normalizes_case_for_supported_values():
 
 def test_normalize_output_format_strips_whitespace_for_supported_values():
     assert normalize_output_format("gpt-image-1", " png ") == "png"
+
+
 def test_normalize_output_format_keeps_supported_values():
     assert normalize_output_format("gpt-image-1", "png") == "png"
     assert normalize_output_format("gpt-image-1", "jpeg") == "jpeg"
diff --git a/skills/skill-creator/scripts/quick_validate.py b/skills/skill-creator/scripts/quick_validate.py
index e8737b4f156..86897b373d7 100644
--- a/skills/skill-creator/scripts/quick_validate.py
+++ b/skills/skill-creator/scripts/quick_validate.py
@@ -6,7 +6,6 @@ Quick validation script for skills - minimal version
 import re
 import sys
 from pathlib import Path
-from typing import Optional
 
 try:
     import yaml
@@ -16,7 +15,7 @@ except ModuleNotFoundError:
 MAX_SKILL_NAME_LENGTH = 64
 
 
-def _extract_frontmatter(content: str) -> Optional[str]:
+def _extract_frontmatter(content: str) -> str | None:
     lines = content.splitlines()
     if not lines or lines[0].strip() != "---":
         return None
@@ -26,13 +25,13 @@ def _extract_frontmatter(content: str) -> Optional[str]:
     return None
 
 
-def _parse_simple_frontmatter(frontmatter_text: str) -> Optional[dict[str, str]]:
+def _parse_simple_frontmatter(frontmatter_text: str) -> dict[str, str] | None:
     """
     Minimal fallback parser used when PyYAML is unavailable.
     Supports simple `key: value` mappings used by SKILL.md frontmatter.
     """
     parsed: dict[str, str] = {}
-    current_key: Optional[str] = None
+    current_key: str | None = None
     for raw_line in frontmatter_text.splitlines():
         stripped = raw_line.strip()
         if not stripped or stripped.startswith("#"):
@@ -43,9 +42,7 @@ def _parse_simple_frontmatter(frontmatter_text: str) -> Optional[dict[str, str]]
             if current_key is None:
                 return None
             current_value = parsed[current_key]
-            parsed[current_key] = (
-                f"{current_value}\n{stripped}" if current_value else stripped
-            )
+            parsed[current_key] = f"{current_value}\n{stripped}" if current_value else stripped
             continue
 
         if ":" not in stripped:
@@ -55,9 +52,7 @@ def _parse_simple_frontmatter(frontmatter_text: str) -> Optional[dict[str, str]]
         value = value.strip()
         if not key:
             return None
-        if (value.startswith('"') and value.endswith('"')) or (
-            value.startswith("'") and value.endswith("'")
-        ):
+        if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
             value = value[1:-1]
         parsed[key] = value
         current_key = key
@@ -129,8 +124,7 @@ def validate_skill(skill_path):
         if len(name) > MAX_SKILL_NAME_LENGTH:
             return (
                 False,
-                f"Name is too long ({len(name)} characters). "
-                f"Maximum is {MAX_SKILL_NAME_LENGTH} characters.",
+                f"Name is too long ({len(name)} characters). Maximum is {MAX_SKILL_NAME_LENGTH} characters.",
             )
 
     description = frontmatter.get("description", "")

From a626c42fcd03744db962b12a9c3d9ee80f9990c7 Mon Sep 17 00:00:00 2001
From: Ayaan Zaidi <hi@obviy.us>
Date: Sun, 15 Mar 2026 09:48:08 +0530
Subject: [PATCH 11/16] fix(android): theme popup surfaces

---
 CHANGELOG.md                                      |  1 +
 .../java/ai/openclaw/app/ui/ConnectTabScreen.kt   | 14 +++++++++++---
 .../java/ai/openclaw/app/ui/OnboardingFlow.kt     | 15 ++++++++++++---
 .../java/ai/openclaw/app/ui/chat/ChatComposer.kt  | 10 +++++++++-
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b7e204965d7..b1f88842755 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai
 - Configure/startup: move outbound send-deps resolution into a lightweight helper so `openclaw configure` no longer stalls after the banner while eagerly loading channel plugins. (#46301) thanks @scoootscooob.
 - Zalo Personal/group gating: stop reapplying `dmPolicy.allowFrom` as a sender gate for already-allowlisted groups when `groupAllowFrom` is unset, so any member of an allowed group can trigger replies while DMs stay restricted. (#40146)
 - Plugins/install precedence: keep bundled plugins ahead of auto-discovered globals by default, but let an explicitly installed plugin record win its own duplicate-id tie so installed channel plugins load from `~/.openclaw/extensions` after `openclaw plugins install`.
+- Android/chat: theme the thinking dropdown and TLS trust dialogs explicitly so popup surfaces match the active app theme instead of falling back to mismatched Material defaults.
 
 ### Fixes
 
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/ui/ConnectTabScreen.kt b/apps/android/app/src/main/java/ai/openclaw/app/ui/ConnectTabScreen.kt
index 0f5a2c7d08e..9ca0ad3f47f 100644
--- a/apps/android/app/src/main/java/ai/openclaw/app/ui/ConnectTabScreen.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/ui/ConnectTabScreen.kt
@@ -92,20 +92,28 @@ fun ConnectTabScreen(viewModel: MainViewModel) {
     val prompt = pendingTrust!!
     AlertDialog(
       onDismissRequest = { viewModel.declineGatewayTrustPrompt() },
-      title = { Text("Trust this gateway?") },
+      containerColor = mobileCardSurface,
+      title = { Text("Trust this gateway?", style = mobileHeadline, color = mobileText) },
       text = {
         Text(
           "First-time TLS connection.\n\nVerify this SHA-256 fingerprint before trusting:\n${prompt.fingerprintSha256}",
           style = mobileCallout,
+          color = mobileText,
         )
       },
       confirmButton = {
-        TextButton(onClick = { viewModel.acceptGatewayTrustPrompt() }) {
+        TextButton(
+          onClick = { viewModel.acceptGatewayTrustPrompt() },
+          colors = ButtonDefaults.textButtonColors(contentColor = mobileAccent),
+        ) {
           Text("Trust and continue")
         }
       },
       dismissButton = {
-        TextButton(onClick = { viewModel.declineGatewayTrustPrompt() }) {
+        TextButton(
+          onClick = { viewModel.declineGatewayTrustPrompt() },
+          colors = ButtonDefaults.textButtonColors(contentColor = mobileTextSecondary),
+        ) {
           Text("Cancel")
         }
       },
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/ui/OnboardingFlow.kt b/apps/android/app/src/main/java/ai/openclaw/app/ui/OnboardingFlow.kt
index 8c4df5beed5..28487439c0b 100644
--- a/apps/android/app/src/main/java/ai/openclaw/app/ui/OnboardingFlow.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/ui/OnboardingFlow.kt
@@ -455,19 +455,28 @@ fun OnboardingFlow(viewModel: MainViewModel, modifier: Modifier = Modifier) {
     val prompt = pendingTrust!!
     AlertDialog(
       onDismissRequest = { viewModel.declineGatewayTrustPrompt() },
-      title = { Text("Trust this gateway?") },
+      containerColor = onboardingSurface,
+      title = { Text("Trust this gateway?", style = onboardingHeadlineStyle, color = onboardingText) },
       text = {
         Text(
           "First-time TLS connection.\n\nVerify this SHA-256 fingerprint before trusting:\n${prompt.fingerprintSha256}",
+          style = onboardingCalloutStyle,
+          color = onboardingText,
         )
       },
       confirmButton = {
-        TextButton(onClick = { viewModel.acceptGatewayTrustPrompt() }) {
+        TextButton(
+          onClick = { viewModel.acceptGatewayTrustPrompt() },
+          colors = ButtonDefaults.textButtonColors(contentColor = onboardingAccent),
+        ) {
           Text("Trust and continue")
         }
       },
       dismissButton = {
-        TextButton(onClick = { viewModel.declineGatewayTrustPrompt() }) {
+        TextButton(
+          onClick = { viewModel.declineGatewayTrustPrompt() },
+          colors = ButtonDefaults.textButtonColors(contentColor = onboardingTextSecondary),
+        ) {
           Text("Cancel")
         }
       },
diff --git a/apps/android/app/src/main/java/ai/openclaw/app/ui/chat/ChatComposer.kt b/apps/android/app/src/main/java/ai/openclaw/app/ui/chat/ChatComposer.kt
index f641486794b..1adcc34c2d6 100644
--- a/apps/android/app/src/main/java/ai/openclaw/app/ui/chat/ChatComposer.kt
+++ b/apps/android/app/src/main/java/ai/openclaw/app/ui/chat/ChatComposer.kt
@@ -128,7 +128,15 @@ fun ChatComposer(
           }
         }
 
-        DropdownMenu(expanded = showThinkingMenu, onDismissRequest = { showThinkingMenu = false }) {
+        DropdownMenu(
+          expanded = showThinkingMenu,
+          onDismissRequest = { showThinkingMenu = false },
+          shape = RoundedCornerShape(16.dp),
+          containerColor = mobileCardSurface,
+          tonalElevation = 0.dp,
+          shadowElevation = 8.dp,
+          border = BorderStroke(1.dp, mobileBorder),
+        ) {
           ThinkingMenuItem("off", thinkingLevel, onSetThinkingLevel) { showThinkingMenu = false }
           ThinkingMenuItem("low", thinkingLevel, onSetThinkingLevel) { showThinkingMenu = false }
           ThinkingMenuItem("medium", thinkingLevel, onSetThinkingLevel) { showThinkingMenu = false }

From 10c7872856fa04da925594b07959fa3240b88f24 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 00:40:12 -0400
Subject: [PATCH 12/16] Auto-reply: fix followup media prompt rebuild

---
 src/auto-reply/reply/followup-media.ts       |  55 +++++--
 src/auto-reply/reply/followup-runner.test.ts | 143 +++++++++++++++++++
 2 files changed, 183 insertions(+), 15 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index 61d3b9f9cfb..d862a77875b 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -55,6 +55,30 @@ function replaceLastOccurrence(
   return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
 }
 
+function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): number {
+  if (!search) {
+    return -1;
+  }
+  const fileBlockIndex = value.search(FILE_BLOCK_RE);
+  const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value;
+  return bodyRegion.indexOf(search);
+}
+
+function replaceFirstOccurrenceBeforeFileBlocks(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = findFirstOccurrenceBeforeFileBlocks(value, search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
 function stripInlineDirectives(text: string | undefined): string {
   return parseInlineDirectives(text ?? "").cleaned.trim();
 }
@@ -92,20 +116,29 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     stripped = stripLeadingMediaReplyHint(stripped);
   }
 
+  const replacementTargets = [
+    params.originalBody?.trim(),
+    stripInlineDirectives(params.originalBody),
+    MEDIA_ONLY_PLACEHOLDER,
+  ].filter(
+    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
+  );
+
   // Strip pre-existing file blocks from the body region when the updated body
   // contains new file blocks.  Mixed messages (audio + PDF) can arrive with
   // file extraction already applied in the primary path; without this strip
   // the old block stays in the prompt while the updated body adds a new one,
   // duplicating potentially large file payloads.
-  // Scope stripping to the body segment so quoted/replied text and thread
-  // history above the body retain any legitimate <file> blocks.
+  // Scope stripping to the confirmed body segment so quoted/replied text,
+  // thread history above the body, and prompts whose original body no longer
+  // appears all retain any legitimate <file> blocks.
   if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) {
-    const bodyTarget = params.originalBody?.trim();
-    const bodyIdx = bodyTarget ? stripped.lastIndexOf(bodyTarget) : -1;
+    const bodyIdx =
+      replacementTargets
+        .map((target) => findFirstOccurrenceBeforeFileBlocks(stripped, target))
+        .find((index) => index >= 0) ?? -1;
     if (bodyIdx >= 0) {
       stripped = stripped.slice(0, bodyIdx) + stripExistingFileBlocks(stripped.slice(bodyIdx));
-    } else {
-      stripped = stripExistingFileBlocks(stripped);
     }
   }
 
@@ -117,17 +150,9 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
     return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
   }
 
-  const replacementTargets = [
-    params.originalBody?.trim(),
-    stripInlineDirectives(params.originalBody),
-    MEDIA_ONLY_PLACEHOLDER,
-  ].filter(
-    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
-  );
-
   let rebuilt = stripped;
   for (const target of replacementTargets) {
-    const replaced = replaceLastOccurrence(rebuilt, target, updatedBody);
+    const replaced = replaceFirstOccurrenceBeforeFileBlocks(rebuilt, target, updatedBody);
     if (replaced !== undefined) {
       rebuilt = replaced;
       return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index bae9fa98391..d06442fad5c 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1821,6 +1821,149 @@ describe("createFollowupRunner media understanding", () => {
     expect(agentCall?.prompt).not.toContain("old extracted content");
   });
 
+  it("preserves unrelated file blocks when the original body is absent from the prompt", async () => {
+    const quotedFileBlock =
+      '<file name="thread.pdf" mime="application/pdf">\nquoted thread attachment\n</file>';
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';
+    const newFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nnew extracted content\n</file>';
+    const transcriptText = "Transcript from deferred audio";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nQuoted thread above\n\n${quotedFileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${existingFileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Quoted thread above");
+    expect(agentCall?.prompt).toContain(quotedFileBlock);
+    expect(agentCall?.prompt).toContain(newFileBlock);
+    expect(agentCall?.prompt?.match(/<file\s+name="/g)).toHaveLength(2);
+  });
+
+  it("replaces the visible body before file blocks instead of matching file content", async () => {
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nsummary notes:\nsummarize this\n</file>';
+    const transcriptText = "Transcript from deferred audio";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${existingFileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${existingFileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`;
+    expect(agentCall?.prompt).toContain(existingFileBlock);
+    expect(agentCall?.prompt).toContain(transcriptBlock);
+    expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeGreaterThan(-1);
+    expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeLessThan(
+      agentCall?.prompt?.indexOf(existingFileBlock) ?? -1,
+    );
+  });
+
   it("sets DeferredMediaApplied when media understanding throws", async () => {
     applyMediaUnderstandingMock.mockRejectedValueOnce(
       new Error("transcription service unavailable"),

From b853820eb229ce927acef66407e45c6e48e7565d Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 12:12:12 -0400
Subject: [PATCH 13/16] fix: address Codex and Greptile review comments on
 #46454

Replace regex-based file extraction detection (FILE_BLOCK_RE.test) with a
DeferredFileBlocksExtracted mutation marker on FollowupMediaContext.  The old
approach scanned user body text for '<file name=' patterns, which could
false-positive on literal user messages.  The new approach compares Body
against RawBody (never mutated by the primary path) to detect file extraction,
then stores a boolean marker for subsequent checks.

Fixes 2 & 4 (linked):
- Added DeferredFileBlocksExtracted marker to FollowupMediaContext type
- Mutation detection uses RawBody (not CommandBody) as reference to avoid
  false-positives when /think directives differ between Body and CommandBody
- snapshotUpdatedMediaContext propagates the marker via appliedFile

Fixes 1, 3, 5, 6 were already addressed by prior commits on this branch.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/auto-reply/reply/followup-media.ts | 33 ++++++++++++++++++++------
 src/auto-reply/reply/queue/types.ts    |  7 ++++++
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index d862a77875b..d9202554526 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -183,10 +183,18 @@ function hasOnlyFileLikeAttachments(mediaContext: FollowupMediaContext): boolean
   );
 }
 
+function hasAnyFileAttachments(mediaContext: FollowupMediaContext): boolean {
+  return normalizeAttachments(mediaContext as MsgContext).some((attachment) => {
+    const kind = resolveAttachmentKind(attachment);
+    return kind !== "audio" && kind !== "image" && kind !== "video";
+  });
+}
+
 function snapshotUpdatedMediaContext(params: {
   original: FollowupMediaContext;
   mediaCtx: MsgContext;
   updatedBody?: string;
+  appliedFile?: boolean;
 }): FollowupMediaContext {
   return {
     ...params.original,
@@ -202,6 +210,8 @@ function snapshotUpdatedMediaContext(params: {
       ? [...params.mediaCtx.MediaUnderstandingDecisions]
       : params.original.MediaUnderstandingDecisions,
     DeferredMediaApplied: true,
+    DeferredFileBlocksExtracted:
+      params.original.DeferredFileBlocksExtracted || params.appliedFile || undefined,
   };
 }
 
@@ -227,13 +237,21 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
 
   const resolvedOriginalBody =
     mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
-  // Detect file extraction from the primary path by checking whether Body was
-  // mutated AND contains file blocks.  This avoids false-positives when a user
-  // message literally contains "<file name=" text.
-  const fileExtractionAlreadyApplied =
-    mediaContext.Body !== resolvedOriginalBody && FILE_BLOCK_RE.test(mediaContext.Body ?? "");
+  // Detect file extraction from the primary path via body mutation instead of
+  // scanning for literal '<file name=' patterns (which false-positives on user
+  // text).  Compare Body against RawBody (never mutated by the primary path's
+  // media/file processing) rather than CommandBody (which differs from Body
+  // when inline directives like /think were stripped).
+  const referenceBody = mediaContext.RawBody ?? mediaContext.Body;
+  if (
+    !mediaContext.DeferredFileBlocksExtracted &&
+    mediaContext.Body !== referenceBody &&
+    hasAnyFileAttachments(mediaContext)
+  ) {
+    mediaContext.DeferredFileBlocksExtracted = true;
+  }
 
-  if (fileExtractionAlreadyApplied && hasOnlyFileLikeAttachments(mediaContext)) {
+  if (mediaContext.DeferredFileBlocksExtracted && hasOnlyFileLikeAttachments(mediaContext)) {
     mediaContext.DeferredMediaApplied = true;
     return;
   }
@@ -266,7 +284,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
       muResult.appliedAudio ||
       muResult.appliedImage ||
       muResult.appliedVideo ||
-      (muResult.appliedFile && !fileExtractionAlreadyApplied);
+      (muResult.appliedFile && !mediaContext.DeferredFileBlocksExtracted);
 
     if (shouldRebuildPrompt) {
       const newMediaNote = buildInboundMediaNote(mediaCtx);
@@ -285,6 +303,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
       original: mediaContext,
       mediaCtx,
       updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined,
+      appliedFile: muResult.appliedFile,
     });
   } catch (err) {
     mediaContext.DeferredMediaApplied = true;
diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts
index 291059a28d7..e1e9e20e5c8 100644
--- a/src/auto-reply/reply/queue/types.ts
+++ b/src/auto-reply/reply/queue/types.ts
@@ -50,6 +50,13 @@ export type FollowupMediaContext = {
   AccountId?: string;
   MessageThreadId?: string | number;
   DeferredMediaApplied?: boolean;
+  /**
+   * Set when file extraction has already been applied to Body (either in the
+   * primary path or by a previous deferred-media run).  Checked instead of
+   * scanning body text for `<file` patterns to avoid false-positives on user
+   * messages that contain literal XML-like text.
+   */
+  DeferredFileBlocksExtracted?: boolean;
 };
 
 export type FollowupRun = {

From 280c5584f784fa8481112defeca85b602633e4c5 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 15:51:14 -0400
Subject: [PATCH 14/16] fix: use file-block-safe replacement in
 normalizeUpdatedBody and trailing fallback (#46454)

---
 src/auto-reply/reply/followup-media.test.ts | 107 ++++++++++++++++++++
 src/auto-reply/reply/followup-media.ts      |  44 +++++++-
 2 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 src/auto-reply/reply/followup-media.test.ts

diff --git a/src/auto-reply/reply/followup-media.test.ts b/src/auto-reply/reply/followup-media.test.ts
new file mode 100644
index 00000000000..d8ba453f801
--- /dev/null
+++ b/src/auto-reply/reply/followup-media.test.ts
@@ -0,0 +1,107 @@
+import { describe, expect, it } from "vitest";
+import {
+  _findLastOccurrenceBeforeFileBlocks as findLastOccurrenceBeforeFileBlocks,
+  _normalizeUpdatedBody as normalizeUpdatedBody,
+  _rebuildQueuedPromptWithMediaUnderstanding as rebuildQueuedPromptWithMediaUnderstanding,
+} from "./followup-media.js";
+
+const FILE_BLOCK = '<file name="doc.pdf" type="application/pdf">\nPDF content\n</file>';
+
+describe("findLastOccurrenceBeforeFileBlocks", () => {
+  it("returns -1 for empty search", () => {
+    expect(findLastOccurrenceBeforeFileBlocks("hello", "")).toBe(-1);
+  });
+
+  it("finds last occurrence in body region before file blocks", () => {
+    const value = `hello world hello\n${FILE_BLOCK}`;
+    // "hello" appears at 0 and 12 — both before the file block
+    expect(findLastOccurrenceBeforeFileBlocks(value, "hello")).toBe(12);
+  });
+
+  it("does not match inside file block content", () => {
+    const value = `some text\n${FILE_BLOCK}\nPDF content`;
+    // "PDF content" appears in the file block and after it, but the body region
+    // (before <file) is just "some text\n" — no match there.
+    expect(findLastOccurrenceBeforeFileBlocks(value, "PDF content")).toBe(-1);
+  });
+
+  it("uses lastIndexOf in fallback when search itself contains file blocks", () => {
+    // When the search string contains a <file> block, it can't appear in the
+    // body-only region, so the fallback searches the full value.
+    const bodyWithFile = `caption\n${FILE_BLOCK}`;
+    const value = `previous\n${bodyWithFile}\nlater\n${bodyWithFile}`;
+    // Should find the *last* (trailing) occurrence
+    const expected = value.lastIndexOf(bodyWithFile);
+    expect(findLastOccurrenceBeforeFileBlocks(value, bodyWithFile)).toBe(expected);
+    expect(expected).toBeGreaterThan(value.indexOf(bodyWithFile));
+  });
+
+  it("returns index when no file blocks exist in value", () => {
+    expect(findLastOccurrenceBeforeFileBlocks("abc abc", "abc")).toBe(4);
+  });
+});
+
+describe("normalizeUpdatedBody", () => {
+  it("returns empty string when updatedBody is empty", () => {
+    expect(normalizeUpdatedBody({ originalBody: "foo", updatedBody: "" })).toBe("");
+  });
+
+  it("returns updatedBody when originalBody is empty", () => {
+    expect(normalizeUpdatedBody({ updatedBody: "hello" })).toBe("hello");
+  });
+
+  it("strips directives when updatedBody equals originalBody", () => {
+    const body = "/think high tell me a joke";
+    const result = normalizeUpdatedBody({ originalBody: body, updatedBody: body });
+    expect(result).toBe("tell me a joke");
+  });
+
+  it("does not corrupt file block content during directive cleanup", () => {
+    const originalBody = "/think high tell me about this file";
+    // updatedBody has the original body plus a file block appended by media processing
+    const updatedBody = `${originalBody}\n${FILE_BLOCK}`;
+    const result = normalizeUpdatedBody({ originalBody, updatedBody });
+    // The directive should be stripped from the body portion, file block preserved
+    expect(result).toContain("tell me about this file");
+    expect(result).toContain(FILE_BLOCK);
+    expect(result).not.toContain("/think");
+  });
+
+  it("replaces in body region, not inside file blocks", () => {
+    const originalBody = "PDF content";
+    const updatedBody = `PDF content\n<file name="doc.pdf" type="application/pdf">\nPDF content\n</file>`;
+    // The replacement should target the body region "PDF content" before the
+    // file block, not the "PDF content" inside the <file> block.
+    const result = normalizeUpdatedBody({ originalBody, updatedBody });
+    // With no directives to strip, original === cleaned, updatedBody !== originalBody
+    // because updatedBody has the file block appended.  The replacement targets the
+    // body-region occurrence.
+    expect(result).toContain('<file name="doc.pdf"');
+    expect(result).toContain("PDF content\n</file>");
+  });
+});
+
+describe("rebuildQueuedPromptWithMediaUnderstanding", () => {
+  it("replaces original body with updated body in prompt", () => {
+    const result = rebuildQueuedPromptWithMediaUnderstanding({
+      prompt: "thread context\nhello world",
+      originalBody: "hello world",
+      updatedBody: 'hello world\n<file name="a.pdf">data</file>',
+    });
+    expect(result).toContain('<file name="a.pdf">data</file>');
+    expect(result).toContain("thread context");
+  });
+
+  it("preserves file blocks in thread history when body is replaced", () => {
+    const prompt = `history\n<file name="old.pdf">old</file>\nhello world`;
+    const result = rebuildQueuedPromptWithMediaUnderstanding({
+      prompt,
+      originalBody: "hello world",
+      updatedBody: "hello world transcribed",
+    });
+    // The old file block from history should be preserved since updatedBody
+    // has no file blocks of its own.
+    expect(result).toContain('<file name="old.pdf">old</file>');
+    expect(result).toContain("hello world transcribed");
+  });
+});
diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index d9202554526..f40ec91f109 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -64,6 +64,40 @@ function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): num
   return bodyRegion.indexOf(search);
 }
 
+function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number {
+  if (!search) {
+    return -1;
+  }
+  const fileBlockIndex = value.search(FILE_BLOCK_RE);
+  const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value;
+  const index = bodyRegion.lastIndexOf(search);
+  if (index >= 0) {
+    return index;
+  }
+  // Fallback: search string itself contains file blocks — it can't appear in the
+  // body-only region.  Search the full value with lastIndexOf to pick the trailing
+  // (most recent) occurrence when thread/history has identical <file> bodies.
+  if (FILE_BLOCK_RE.test(search)) {
+    return value.lastIndexOf(search);
+  }
+  return -1;
+}
+
+function replaceLastOccurrenceBeforeFileBlocks(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = findLastOccurrenceBeforeFileBlocks(value, search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
 function replaceFirstOccurrenceBeforeFileBlocks(
   value: string,
   search: string,
@@ -101,7 +135,8 @@ function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: str
     return cleanedOriginalBody;
   }
   return (
-    replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody
+    replaceLastOccurrenceBeforeFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ??
+    updatedBody
   ).trim();
 }
 
@@ -215,6 +250,13 @@ function snapshotUpdatedMediaContext(params: {
   };
 }
 
+// Exported for unit testing — these are pure string helpers with no side effects.
+export {
+  findLastOccurrenceBeforeFileBlocks as _findLastOccurrenceBeforeFileBlocks,
+  normalizeUpdatedBody as _normalizeUpdatedBody,
+  rebuildQueuedPromptWithMediaUnderstanding as _rebuildQueuedPromptWithMediaUnderstanding,
+};
+
 export async function applyDeferredMediaUnderstandingToQueuedRun(
   queued: FollowupRun,
   params: { logLabel?: string } = {},

From 4092f82615ccdcad2a90c95ee3418f4abc3642b9 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 17:34:56 -0400
Subject: [PATCH 15/16] fix: trailing body match and RawBody-missing extraction
 detection (#46454)

---
 src/auto-reply/reply/followup-media.ts       | 171 ++++++++++++-------
 src/auto-reply/reply/followup-runner.test.ts | 158 +++++++++++++++++
 src/auto-reply/reply/queue/drain.ts          |   9 +-
 3 files changed, 272 insertions(+), 66 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index f40ec91f109..425bdd601ea 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -1,3 +1,4 @@
+import path from "node:path";
 import { logVerbose } from "../../globals.js";
 import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
 import {
@@ -40,30 +41,6 @@ function stripLeadingMediaReplyHint(prompt: string): string {
   return prompt.trim();
 }
 
-function replaceLastOccurrence(
-  value: string,
-  search: string,
-  replacement: string,
-): string | undefined {
-  if (!search) {
-    return undefined;
-  }
-  const index = value.lastIndexOf(search);
-  if (index < 0) {
-    return undefined;
-  }
-  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
-}
-
-function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): number {
-  if (!search) {
-    return -1;
-  }
-  const fileBlockIndex = value.search(FILE_BLOCK_RE);
-  const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value;
-  return bodyRegion.indexOf(search);
-}
-
 function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number {
   if (!search) {
     return -1;
@@ -98,21 +75,81 @@ function replaceLastOccurrenceBeforeFileBlocks(
   return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
 }
 
-function replaceFirstOccurrenceBeforeFileBlocks(
+function findTrailingReplacementTargetBeforeFileBlocks(
+  value: string,
+  targets: string[],
+): { index: number; target: string } | undefined {
+  let bestMatch: { index: number; target: string } | undefined;
+  for (const target of targets) {
+    const index = findLastOccurrenceBeforeFileBlocks(value, target);
+    if (index < 0) {
+      continue;
+    }
+    if (!bestMatch || index > bestMatch.index) {
+      bestMatch = { index, target };
+    }
+  }
+  return bestMatch;
+}
+
+function replaceOccurrenceAtIndex(
   value: string,
   search: string,
   replacement: string,
-): string | undefined {
-  if (!search) {
-    return undefined;
-  }
-  const index = findFirstOccurrenceBeforeFileBlocks(value, search);
-  if (index < 0) {
-    return undefined;
-  }
+  index: number,
+): string {
   return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
 }
 
+function decodeXmlAttr(value: string): string {
+  return value
+    .replace(/&quot;/g, '"')
+    .replace(/&apos;/g, "'")
+    .replace(/&lt;/g, "<")
+    .replace(/&gt;/g, ">")
+    .replace(/&amp;/g, "&");
+}
+
+function extractAttachmentFileName(value?: string): string | undefined {
+  const trimmed = value?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  if (/^[a-zA-Z][a-zA-Z\d+.-]*:/.test(trimmed)) {
+    try {
+      const pathname = new URL(trimmed).pathname;
+      const basename = path.posix.basename(pathname);
+      return basename || undefined;
+    } catch {
+      // Fall back to path-style parsing below.
+    }
+  }
+  const normalized = trimmed.replace(/\\/g, "/");
+  const basename = path.posix.basename(normalized);
+  return basename || undefined;
+}
+
+function bodyContainsMatchingFileBlock(mediaContext: FollowupMediaContext): boolean {
+  const body = mediaContext.Body?.trim();
+  if (!body || !FILE_BLOCK_RE.test(body)) {
+    return false;
+  }
+  const bodyFileNames = new Set<string>();
+  for (const match of body.matchAll(/<file\s+name="([^"]*)"[^>]*>/gi)) {
+    const fileName = match[1]?.trim();
+    if (fileName) {
+      bodyFileNames.add(decodeXmlAttr(fileName));
+    }
+  }
+  if (bodyFileNames.size === 0) {
+    return false;
+  }
+  return normalizeAttachments(mediaContext as MsgContext).some((attachment) => {
+    const fileName = extractAttachmentFileName(attachment.path ?? attachment.url);
+    return Boolean(fileName && bodyFileNames.has(fileName));
+  });
+}
+
 function stripInlineDirectives(text: string | undefined): string {
   return parseInlineDirectives(text ?? "").cleaned.trim();
 }
@@ -168,12 +205,14 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
   // thread history above the body, and prompts whose original body no longer
   // appears all retain any legitimate <file> blocks.
   if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) {
-    const bodyIdx =
-      replacementTargets
-        .map((target) => findFirstOccurrenceBeforeFileBlocks(stripped, target))
-        .find((index) => index >= 0) ?? -1;
-    if (bodyIdx >= 0) {
-      stripped = stripped.slice(0, bodyIdx) + stripExistingFileBlocks(stripped.slice(bodyIdx));
+    const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks(
+      stripped,
+      replacementTargets,
+    );
+    if (trailingMatch) {
+      stripped =
+        stripped.slice(0, trailingMatch.index) +
+        stripExistingFileBlocks(stripped.slice(trailingMatch.index));
     }
   }
 
@@ -186,12 +225,15 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: {
   }
 
   let rebuilt = stripped;
-  for (const target of replacementTargets) {
-    const replaced = replaceFirstOccurrenceBeforeFileBlocks(rebuilt, target, updatedBody);
-    if (replaced !== undefined) {
-      rebuilt = replaced;
-      return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
-    }
+  const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks(rebuilt, replacementTargets);
+  if (trailingMatch) {
+    rebuilt = replaceOccurrenceAtIndex(
+      rebuilt,
+      trailingMatch.target,
+      updatedBody,
+      trailingMatch.index,
+    );
+    return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
   }
 
   rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
@@ -268,29 +310,29 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
   if (!mediaContext || mediaContext.DeferredMediaApplied) {
     return;
   }
-  if (mediaContext.MediaUnderstanding?.length) {
-    mediaContext.DeferredMediaApplied = true;
-    return;
-  }
   if (!hasMediaAttachments(mediaContext)) {
     mediaContext.DeferredMediaApplied = true;
     return;
   }
-
-  const resolvedOriginalBody =
-    mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
-  // Detect file extraction from the primary path via body mutation instead of
-  // scanning for literal '<file name=' patterns (which false-positives on user
-  // text).  Compare Body against RawBody (never mutated by the primary path's
-  // media/file processing) rather than CommandBody (which differs from Body
-  // when inline directives like /think were stripped).
   const referenceBody = mediaContext.RawBody ?? mediaContext.Body;
-  if (
-    !mediaContext.DeferredFileBlocksExtracted &&
-    mediaContext.Body !== referenceBody &&
-    hasAnyFileAttachments(mediaContext)
-  ) {
-    mediaContext.DeferredFileBlocksExtracted = true;
+  // Prefer RawBody-vs-Body comparison when RawBody exists. If RawBody is
+  // missing, fall back to explicit file-extraction signals instead of
+  // re-running extraction just because the clean pre-extraction body is gone.
+  if (!mediaContext.DeferredFileBlocksExtracted && hasAnyFileAttachments(mediaContext)) {
+    const rawBodyMissing = typeof mediaContext.RawBody !== "string";
+    if (mediaContext.Body !== referenceBody) {
+      mediaContext.DeferredFileBlocksExtracted = true;
+    } else if (
+      rawBodyMissing &&
+      (Boolean(mediaContext.MediaUnderstanding?.length) ||
+        bodyContainsMatchingFileBlock(mediaContext))
+    ) {
+      mediaContext.DeferredFileBlocksExtracted = true;
+    }
+  }
+  if (mediaContext.MediaUnderstanding?.length) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
   }
 
   if (mediaContext.DeferredFileBlocksExtracted && hasOnlyFileLikeAttachments(mediaContext)) {
@@ -298,6 +340,9 @@ export async function applyDeferredMediaUnderstandingToQueuedRun(
     return;
   }
 
+  const resolvedOriginalBody =
+    mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
+
   try {
     const mediaCtx = {
       ...mediaContext,
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index d06442fad5c..045d8ad0658 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -2034,6 +2034,114 @@ describe("createFollowupRunner media understanding", () => {
     };
     expect(agentCall?.prompt?.match(/<file\s+name="report\.pdf"/g)).toHaveLength(1);
   });
+
+  it("does not re-apply file extraction when RawBody is missing but Body already has a matching file block", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${fileBlock}`,
+          CommandBody: "summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(fileBlock);
+  });
+
+  it("replaces the trailing repeated body segment instead of the first matching thread text", async () => {
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';
+    const newFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nnew extracted content\n</file>';
+    const transcriptText = "Deferred transcript";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nThread history: summarize this\n\n` +
+          `summarize this\n\n${existingFileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${existingFileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Thread history: summarize this");
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).toContain(newFileBlock);
+    expect(agentCall?.prompt).not.toContain("old extracted content");
+    expect(agentCall?.prompt?.indexOf(newFileBlock)).toBeGreaterThan(
+      agentCall?.prompt?.lastIndexOf("summarize this") ?? -1,
+    );
+  });
 });
 
 describe("followup queue drain deferred media understanding", () => {
@@ -2102,6 +2210,56 @@ describe("followup queue drain deferred media understanding", () => {
     expect(prompt).not.toContain("[media attached: /tmp/voice.ogg");
   });
 
+  it("preprocesses queued runs in parallel", async () => {
+    const resolvers: Array<() => void> = [];
+    const done = () => ({
+      outputs: [],
+      decisions: [],
+      appliedImage: false,
+      appliedAudio: false,
+      appliedVideo: false,
+      appliedFile: false,
+    });
+    applyMediaUnderstandingMock.mockImplementation(
+      async () =>
+        await new Promise((resolve) => {
+          resolvers.push(() => resolve(done()));
+        }),
+    );
+
+    const items: FollowupRun[] = [
+      createQueuedRun({
+        prompt: "[media attached: /tmp/voice-1.ogg (audio/ogg)]\nfirst text",
+        summaryLine: "first text",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "first text",
+          MediaPaths: ["/tmp/voice-1.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+      createQueuedRun({
+        prompt: "[media attached: /tmp/voice-2.ogg (audio/ogg)]\nsecond text",
+        summaryLine: "second text",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "second text",
+          MediaPaths: ["/tmp/voice-2.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    ];
+
+    const pending = applyDeferredMediaToQueuedRuns(items);
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(2);
+
+    for (const resolve of resolvers) {
+      resolve();
+    }
+    await pending;
+  });
+
   it("preprocesses dropped media items before building overflow summaries", async () => {
     applyMediaUnderstandingMock.mockImplementationOnce(
       async (params: { ctx: Record<string, unknown> }) => {
diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 471c94200ba..5d53df34189 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -76,9 +76,12 @@ function clearFollowupQueueSummaryState(queue: FollowupQueueState): void {
 }
 
 export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise<void> {
-  for (const item of items) {
-    await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
-  }
+  await Promise.allSettled(
+    items.map(
+      async (item) =>
+        await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }),
+    ),
+  );
 }
 
 async function resolveSummaryLines(items: FollowupRun[]): Promise<string[]> {

From 42b1271c85489942819476245c767c3033fdce25 Mon Sep 17 00:00:00 2001
From: Joey Krug <joeykrug@gmail.com>
Date: Sun, 15 Mar 2026 18:00:45 -0400
Subject: [PATCH 16/16] fix: parallelize deferred media calls and search full
 prompt outside file blocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Parallelize deferred media understanding calls in resolveSummaryLines and
applyDeferredMediaToQueuedRuns using Promise.allSettled so media API calls
run concurrently while summary line order stays sequential.

Replace findFirstOccurrenceBeforeFileBlocks (which truncated at the first
<file> tag) with findLastOccurrenceOutsideFileBlocks that searches the full
prompt via lastIndexOf and skips matches inside <file>…</file> blocks. This
fixes body replacement when thread/history context has extracted file blocks
before the current queued message body.

Add regression test for body appearing after thread-history file blocks.
---
 src/auto-reply/reply/followup-media.test.ts  | 40 +++++++----
 src/auto-reply/reply/followup-media.ts       | 58 +++++++++++-----
 src/auto-reply/reply/followup-runner.test.ts | 72 ++++++++++++++++++++
 src/auto-reply/reply/queue/drain.ts          | 21 +++---
 4 files changed, 152 insertions(+), 39 deletions(-)

diff --git a/src/auto-reply/reply/followup-media.test.ts b/src/auto-reply/reply/followup-media.test.ts
index d8ba453f801..77996f85606 100644
--- a/src/auto-reply/reply/followup-media.test.ts
+++ b/src/auto-reply/reply/followup-media.test.ts
@@ -1,43 +1,57 @@
 import { describe, expect, it } from "vitest";
 import {
-  _findLastOccurrenceBeforeFileBlocks as findLastOccurrenceBeforeFileBlocks,
+  _findLastOccurrenceOutsideFileBlocks as findLastOccurrenceOutsideFileBlocks,
   _normalizeUpdatedBody as normalizeUpdatedBody,
   _rebuildQueuedPromptWithMediaUnderstanding as rebuildQueuedPromptWithMediaUnderstanding,
 } from "./followup-media.js";
 
 const FILE_BLOCK = '<file name="doc.pdf" type="application/pdf">\nPDF content\n</file>';
 
-describe("findLastOccurrenceBeforeFileBlocks", () => {
+describe("findLastOccurrenceOutsideFileBlocks", () => {
   it("returns -1 for empty search", () => {
-    expect(findLastOccurrenceBeforeFileBlocks("hello", "")).toBe(-1);
+    expect(findLastOccurrenceOutsideFileBlocks("hello", "")).toBe(-1);
   });
 
   it("finds last occurrence in body region before file blocks", () => {
     const value = `hello world hello\n${FILE_BLOCK}`;
     // "hello" appears at 0 and 12 — both before the file block
-    expect(findLastOccurrenceBeforeFileBlocks(value, "hello")).toBe(12);
+    expect(findLastOccurrenceOutsideFileBlocks(value, "hello")).toBe(12);
   });
 
-  it("does not match inside file block content", () => {
+  it("skips matches inside file block content", () => {
+    // "PDF content" appears only inside the file block — no valid match outside.
+    const value = `some text\n${FILE_BLOCK}`;
+    expect(findLastOccurrenceOutsideFileBlocks(value, "PDF content")).toBe(-1);
+  });
+
+  it("finds trailing occurrence outside file block even when also inside one", () => {
     const value = `some text\n${FILE_BLOCK}\nPDF content`;
-    // "PDF content" appears in the file block and after it, but the body region
-    // (before <file) is just "some text\n" — no match there.
-    expect(findLastOccurrenceBeforeFileBlocks(value, "PDF content")).toBe(-1);
+    // "PDF content" appears inside the file block AND after it — the function
+    // should return the trailing occurrence that is outside the block.
+    const expected = value.lastIndexOf("PDF content");
+    expect(findLastOccurrenceOutsideFileBlocks(value, "PDF content")).toBe(expected);
   });
 
-  it("uses lastIndexOf in fallback when search itself contains file blocks", () => {
-    // When the search string contains a <file> block, it can't appear in the
-    // body-only region, so the fallback searches the full value.
+  it("finds occurrence when search itself contains file blocks", () => {
     const bodyWithFile = `caption\n${FILE_BLOCK}`;
     const value = `previous\n${bodyWithFile}\nlater\n${bodyWithFile}`;
     // Should find the *last* (trailing) occurrence
     const expected = value.lastIndexOf(bodyWithFile);
-    expect(findLastOccurrenceBeforeFileBlocks(value, bodyWithFile)).toBe(expected);
+    expect(findLastOccurrenceOutsideFileBlocks(value, bodyWithFile)).toBe(expected);
     expect(expected).toBeGreaterThan(value.indexOf(bodyWithFile));
   });
 
   it("returns index when no file blocks exist in value", () => {
-    expect(findLastOccurrenceBeforeFileBlocks("abc abc", "abc")).toBe(4);
+    expect(findLastOccurrenceOutsideFileBlocks("abc abc", "abc")).toBe(4);
+  });
+
+  it("finds body text after thread-history file blocks", () => {
+    const value = `Thread history\n${FILE_BLOCK}\n\ncheck this out`;
+    // The body "check this out" appears after a file block from thread history.
+    // The old truncation approach would miss this; the new approach finds it.
+    expect(findLastOccurrenceOutsideFileBlocks(value, "check this out")).toBe(
+      value.lastIndexOf("check this out"),
+    );
   });
 });
 
diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
index 425bdd601ea..f0d5d951683 100644
--- a/src/auto-reply/reply/followup-media.ts
+++ b/src/auto-reply/reply/followup-media.ts
@@ -41,26 +41,48 @@ function stripLeadingMediaReplyHint(prompt: string): string {
   return prompt.trim();
 }
 
-function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number {
+/** Collect the [start, end) ranges of every `<file …>…</file>` block in `value`. */
+function collectFileBlockRanges(value: string): Array<[number, number]> {
+  const ranges: Array<[number, number]> = [];
+  const re = new RegExp(FILE_BLOCK_FULL_RE.source, FILE_BLOCK_FULL_RE.flags);
+  let m: RegExpExecArray | null;
+  while ((m = re.exec(value)) !== null) {
+    ranges.push([m.index, m.index + m[0].length]);
+  }
+  return ranges;
+}
+
+function isInsideFileBlock(
+  position: number,
+  length: number,
+  ranges: Array<[number, number]>,
+): boolean {
+  for (const [start, end] of ranges) {
+    if (position >= start && position + length <= end) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Find the last occurrence of `search` in `value` that is NOT inside a
+ * `<file …>…</file>` block.  Searches the full string with lastIndexOf,
+ * then walks backward past any matches that fall inside file blocks.
+ */
+function findLastOccurrenceOutsideFileBlocks(value: string, search: string): number {
   if (!search) {
     return -1;
   }
-  const fileBlockIndex = value.search(FILE_BLOCK_RE);
-  const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value;
-  const index = bodyRegion.lastIndexOf(search);
-  if (index >= 0) {
-    return index;
+  const ranges = collectFileBlockRanges(value);
+  let pos = value.lastIndexOf(search);
+  while (pos >= 0 && isInsideFileBlock(pos, search.length, ranges)) {
+    pos = value.lastIndexOf(search, pos - 1);
   }
-  // Fallback: search string itself contains file blocks — it can't appear in the
-  // body-only region.  Search the full value with lastIndexOf to pick the trailing
-  // (most recent) occurrence when thread/history has identical <file> bodies.
-  if (FILE_BLOCK_RE.test(search)) {
-    return value.lastIndexOf(search);
-  }
-  return -1;
+  return pos;
 }
 
-function replaceLastOccurrenceBeforeFileBlocks(
+function replaceLastOccurrenceOutsideFileBlocks(
   value: string,
   search: string,
   replacement: string,
@@ -68,7 +90,7 @@ function replaceLastOccurrenceBeforeFileBlocks(
   if (!search) {
     return undefined;
   }
-  const index = findLastOccurrenceBeforeFileBlocks(value, search);
+  const index = findLastOccurrenceOutsideFileBlocks(value, search);
   if (index < 0) {
     return undefined;
   }
@@ -81,7 +103,7 @@ function findTrailingReplacementTargetBeforeFileBlocks(
 ): { index: number; target: string } | undefined {
   let bestMatch: { index: number; target: string } | undefined;
   for (const target of targets) {
-    const index = findLastOccurrenceBeforeFileBlocks(value, target);
+    const index = findLastOccurrenceOutsideFileBlocks(value, target);
     if (index < 0) {
       continue;
     }
@@ -172,7 +194,7 @@ function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: str
     return cleanedOriginalBody;
   }
   return (
-    replaceLastOccurrenceBeforeFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ??
+    replaceLastOccurrenceOutsideFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ??
     updatedBody
   ).trim();
 }
@@ -294,7 +316,7 @@ function snapshotUpdatedMediaContext(params: {
 
 // Exported for unit testing — these are pure string helpers with no side effects.
 export {
-  findLastOccurrenceBeforeFileBlocks as _findLastOccurrenceBeforeFileBlocks,
+  findLastOccurrenceOutsideFileBlocks as _findLastOccurrenceOutsideFileBlocks,
   normalizeUpdatedBody as _normalizeUpdatedBody,
   rebuildQueuedPromptWithMediaUnderstanding as _rebuildQueuedPromptWithMediaUnderstanding,
 };
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index 045d8ad0658..ed076b4da59 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -1964,6 +1964,78 @@ describe("createFollowupRunner media understanding", () => {
     );
   });
 
+  it("finds the body after thread-history file blocks when body appears after the first <file> tag", async () => {
+    const threadFileBlock =
+      '<file name="thread.pdf" mime="application/pdf">\nolder thread attachment\n</file>';
+    const transcriptText = "Transcript from deferred voice note";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // The prompt has thread history with a file block BEFORE the current
+    // queued body text.  The old truncation approach would miss the body
+    // because it only searched before the first <file> tag.
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/voice.ogg]\n${MEDIA_REPLY_HINT}\nThread history\n\n${threadFileBlock}\n\ncheck this out`,
+        mediaContext: {
+          Body: "check this out",
+          RawBody: "check this out",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`;
+    // The body should be replaced with the transcript block
+    expect(agentCall?.prompt).toContain(transcriptBlock);
+    // Thread history and its file block should be preserved
+    expect(agentCall?.prompt).toContain("Thread history");
+    expect(agentCall?.prompt).toContain(threadFileBlock);
+  });
+
   it("sets DeferredMediaApplied when media understanding throws", async () => {
     applyMediaUnderstandingMock.mockRejectedValueOnce(
       new Error("transcription service unavailable"),
diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 5d53df34189..41c859d2ec3 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -85,14 +85,19 @@ export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Prom
 }
 
 async function resolveSummaryLines(items: FollowupRun[]): Promise<string[]> {
-  const summaryLines: string[] = [];
-  for (const item of items) {
-    await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" });
-    // After deferred media, prefer the updated prompt (which includes transcripts)
-    // over the original summaryLine (which may just be the caption text).
-    summaryLines.push(buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || ""));
-  }
-  return summaryLines;
+  // Parallelize the media understanding API calls upfront (same pattern as
+  // applyDeferredMediaToQueuedRuns), then build summary lines sequentially
+  // so line order matches the original item order.
+  await Promise.allSettled(
+    items.map((item) =>
+      applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }),
+    ),
+  );
+  // After deferred media, prefer the updated prompt (which includes transcripts)
+  // over the original summaryLine (which may just be the caption text).
+  return items.map((item) =>
+    buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || ""),
+  );
 }
 
 export async function buildMediaAwareQueueSummaryPrompt(params: {