From 106a994a2036307417b5248b5bb3d2438ceccd4b Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 13:40:52 -0400 Subject: [PATCH 01/16] fix: apply media understanding to followup-queued messages (#44682) Voice notes arriving while the agent is mid-turn were queued as followup messages without audio transcription. The followup runner called runEmbeddedPiAgent directly, bypassing applyMediaUnderstanding. This adds a mediaContext field to FollowupRun that snapshots the original message's media fields. Before the agent run, the followup runner checks whether media understanding was applied. If not (empty MediaUnderstanding), it calls applyMediaUnderstanding and rebuilds the prompt with the transcript, matching the primary path's formatting. Co-Authored-By: Claude Opus 4.6 --- src/auto-reply/reply/followup-runner.test.ts | 285 ++++++++++++++++++- src/auto-reply/reply/followup-runner.ts | 65 ++++- src/auto-reply/reply/get-reply-run.ts | 34 ++- src/auto-reply/reply/queue.ts | 5 +- src/auto-reply/reply/queue/types.ts | 37 +++ 5 files changed, 416 insertions(+), 10 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 8d12e815685..132b1291b39 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -2,13 +2,14 @@ import fs from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; import { beforeEach, describe, expect, it, vi } from "vitest"; -import { loadSessionStore, saveSessionStore, type SessionEntry } from "../../config/sessions.js"; +import { loadSessionStore, type SessionEntry, saveSessionStore } from "../../config/sessions.js"; import type { FollowupRun } from "./queue.js"; import { createMockFollowupRun, createMockTypingController } from "./test-helpers.js"; const runEmbeddedPiAgentMock = vi.fn(); const routeReplyMock = vi.fn(); const isRoutableChannelMock = vi.fn(); +const applyMediaUnderstandingMock = vi.fn(); vi.mock( "../../agents/model-fallback.js", @@ -19,6 +20,10 @@ vi.mock("../../agents/pi-embedded.js", () => ({ runEmbeddedPiAgent: (params: unknown) => runEmbeddedPiAgentMock(params), })); +vi.mock("../../media-understanding/apply.js", () => ({ + applyMediaUnderstanding: (params: unknown) => applyMediaUnderstandingMock(params), +})); + vi.mock("./route-reply.js", async (importOriginal) => { const actual = await importOriginal(); return { @@ -47,13 +52,24 @@ beforeEach(() => { isRoutableChannelMock.mockImplementation((ch: string | undefined) => Boolean(ch?.trim() && ROUTABLE_TEST_CHANNELS.has(ch.trim().toLowerCase())), ); + applyMediaUnderstandingMock.mockReset(); + applyMediaUnderstandingMock.mockResolvedValue({ + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }); }); const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun => createMockFollowupRun({ run: { messageProvider } }); function createQueuedRun( - overrides: Partial> & { run?: Partial } = {}, + overrides: Partial> & { + run?: Partial; + } = {}, ): FollowupRun { return createMockFollowupRun(overrides); } @@ -294,7 +310,12 @@ describe("createFollowupRunner messaging tool dedupe", () => { agentResult: { ...makeTextReplyDedupeResult(), messagingToolSentTargets: [ - { tool: "telegram", provider: "telegram", to: "268300329", accountId: "work" }, + { + tool: "telegram", + provider: "telegram", + to: "268300329", + accountId: "work", + }, ], }, queued: { @@ -344,8 +365,13 @@ describe("createFollowupRunner messaging tool dedupe", () => { "sessions.json", ); const sessionKey = "main"; - const sessionEntry: SessionEntry = { sessionId: "session", updatedAt: Date.now() }; - const sessionStore: Record = { [sessionKey]: sessionEntry }; + const sessionEntry: SessionEntry = { + sessionId: "session", + updatedAt: Date.now(), + }; + const sessionStore: Record = { + [sessionKey]: sessionEntry, + }; await saveSessionStore(storePath, sessionStore); const { onBlockReply } = await runMessagingCase({ @@ -539,7 +565,254 @@ describe("createFollowupRunner agentDir forwarding", () => { }); expect(runEmbeddedPiAgentMock).toHaveBeenCalledTimes(1); - const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { agentDir?: string }; + const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + agentDir?: string; + }; expect(call?.agentDir).toBe(agentDir); }); }); + +describe("createFollowupRunner media understanding", () => { + it("applies audio transcription when mediaContext has untranscribed audio", async () => { + const transcriptText = "Hello, this is a voice note."; + // The real applyMediaUnderstanding mutates the ctx; the mock must do the same + // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line. + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "Got it!" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + // MediaUnderstanding is empty — transcription not yet applied + }, + }); + await runner(queued); + + // applyMediaUnderstanding should have been called + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + expect(applyMediaUnderstandingMock).toHaveBeenCalledWith( + expect.objectContaining({ + cfg: queued.run.config, + agentDir: queued.run.agentDir, + }), + ); + + // The prompt passed to the agent should include the transcript, not the + // raw audio attachment line. + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg"); + + expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" })); + }); + + it("skips media understanding when MediaUnderstanding is already populated", async () => { + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "reply" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: "[Audio]\nTranscript:\nAlready transcribed.\n\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + // MediaUnderstanding already populated — transcription was applied in primary path + MediaUnderstanding: [ + { + kind: "audio.transcription", + text: "Already transcribed.", + attachmentIndex: 0, + provider: "whisper", + }, + ], + }, + }); + await runner(queued); + + // Should NOT re-run media understanding + expect(applyMediaUnderstandingMock).not.toHaveBeenCalled(); + + // The original prompt should be passed through unchanged + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("Already transcribed."); + }); + + it("skips media understanding when no mediaContext is present", async () => { + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "reply" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // No mediaContext (plain text message) + const queued = createQueuedRun({ prompt: "just text" }); + await runner(queued); + + expect(applyMediaUnderstandingMock).not.toHaveBeenCalled(); + }); + + it("continues with raw prompt when media understanding fails", async () => { + applyMediaUnderstandingMock.mockRejectedValueOnce(new Error("transcription service down")); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "fallback reply" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const originalPrompt = "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text"; + const queued = createQueuedRun({ + prompt: originalPrompt, + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }); + await runner(queued); + + // Should have attempted media understanding + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + + // Agent should still run with the original prompt + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toBe(originalPrompt); + + expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" })); + }); + + it("preserves non-audio media lines when only audio is transcribed", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + // Simulate transcription updating the context + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: "voice transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = "voice transcript"; + return { + outputs: [ + { + kind: "audio.transcription", + text: "voice transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "got both" }], + meta: {}, + }); + + const onBlockReply = vi.fn(async () => {}); + const runner = createFollowupRunner({ + opts: { onBlockReply }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: + "[media attached: 2 files]\n[media attached 1/2: /tmp/voice.ogg (audio/ogg)]\n[media attached 2/2: /tmp/photo.jpg (image/jpeg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg", "/tmp/photo.jpg"], + MediaTypes: ["audio/ogg", "image/jpeg"], + }, + }); + await runner(queued); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + // Audio attachment line should be stripped + expect(agentCall?.prompt).not.toContain("voice.ogg"); + // Image attachment line should also be stripped (all media-attached lines are + // removed and replaced by the new buildInboundMediaNote output) + // The transcript should be present + expect(agentCall?.prompt).toContain("voice transcript"); + }); +}); diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 8c7eccb5f02..7a59cea3fbd 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -9,10 +9,13 @@ import type { SessionEntry } from "../../config/sessions.js"; import type { TypingMode } from "../../config/types.js"; import { logVerbose } from "../../globals.js"; import { registerAgentRunContext } from "../../infra/agent-events.js"; +import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; +import { formatMediaUnderstandingBody } from "../../media-understanding/format.js"; import { defaultRuntime } from "../../runtime.js"; import { isInternalMessageChannel } from "../../utils/message-channel.js"; import { stripHeartbeatToken } from "../heartbeat.js"; -import type { OriginatingChannelType } from "../templating.js"; +import { buildInboundMediaNote } from "../media-note.js"; +import type { MsgContext, OriginatingChannelType } from "../templating.js"; import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; import { resolveRunAuthProfile } from "./agent-runner-utils.js"; @@ -154,6 +157,66 @@ export function createFollowupRunner(params: { let bootstrapPromptWarningSignaturesSeen = resolveBootstrapWarningSignaturesSeen( activeSessionEntry?.systemPromptReport, ); + + // Apply media understanding for followup-queued messages when it was + // not applied (or failed) in the primary path. This ensures voice + // notes that arrived while the agent was mid-turn still get transcribed. + if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) { + const hasMedia = Boolean( + queued.mediaContext.MediaPath?.trim() || + (Array.isArray(queued.mediaContext.MediaPaths) && + queued.mediaContext.MediaPaths.length > 0), + ); + if (hasMedia) { + try { + const mediaCtx = { ...queued.mediaContext } as MsgContext; + const muResult = await applyMediaUnderstanding({ + ctx: mediaCtx, + cfg: queued.run.config, + agentDir: queued.run.agentDir, + activeModel: { + provider: queued.run.provider, + model: queued.run.model, + }, + }); + if (muResult.outputs.length > 0) { + // Rebuild the prompt with media understanding results baked in, + // matching the primary path's formatting. + const newMediaNote = buildInboundMediaNote(mediaCtx); + const transcriptBody = formatMediaUnderstandingBody({ + body: undefined, + outputs: muResult.outputs, + }); + + // Strip existing [media attached ...] lines from the prompt so + // they can be replaced by the updated media note (which excludes + // successfully-understood attachments like transcribed audio). + const stripped = queued.prompt + .replace(/\[media attached: \d+ files\]\n?/g, "") + .replace(/\[media attached[^\]]*\]\n?/g, ""); + + const parts: string[] = []; + if (newMediaNote) { + parts.push(newMediaNote); + } + if (transcriptBody) { + parts.push(transcriptBody); + } + parts.push(stripped.trim()); + queued.prompt = parts.filter(Boolean).join("\n\n"); + + logVerbose( + `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`, + ); + } + } catch (err) { + logVerbose( + `followup: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`, + ); + } + } + } + try { const fallbackResult = await runWithModelFallback({ cfg: queued.run.config, diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts index 760c42aed1a..4cf76e6ed31 100644 --- a/src/auto-reply/reply/get-reply-run.ts +++ b/src/auto-reply/reply/get-reply-run.ts @@ -384,7 +384,7 @@ export async function runPreparedReply( const mediaReplyHint = mediaNote ? "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body." : undefined; - let prefixedCommandBody = mediaNote + const prefixedCommandBody = mediaNote ? [mediaNote, mediaReplyHint, prefixedBody ?? ""].filter(Boolean).join("\n").trim() : prefixedBody; if (!resolvedThinkLevel) { @@ -469,11 +469,43 @@ export async function runPreparedReply( isNewSession, }); const authProfileIdSource = sessionEntry?.authProfileOverrideSource; + // Snapshot media-related context for deferred media understanding in the + // followup runner. When MediaUnderstanding is already populated the runner + // knows transcription already succeeded and skips re-application. + const hasMediaAttachments = Boolean( + ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0), + ); + const mediaContext = hasMediaAttachments + ? { + Body: ctx.Body, + CommandBody: ctx.CommandBody, + RawBody: ctx.RawBody, + MediaPath: ctx.MediaPath, + MediaUrl: ctx.MediaUrl, + MediaType: ctx.MediaType, + MediaDir: ctx.MediaDir, + MediaPaths: ctx.MediaPaths ? [...ctx.MediaPaths] : undefined, + MediaUrls: ctx.MediaUrls ? [...ctx.MediaUrls] : undefined, + MediaTypes: ctx.MediaTypes ? [...ctx.MediaTypes] : undefined, + MediaRemoteHost: ctx.MediaRemoteHost, + Transcript: ctx.Transcript, + MediaUnderstanding: ctx.MediaUnderstanding ? [...ctx.MediaUnderstanding] : undefined, + MediaUnderstandingDecisions: ctx.MediaUnderstandingDecisions + ? [...ctx.MediaUnderstandingDecisions] + : undefined, + OriginatingChannel: ctx.OriginatingChannel, + OriginatingTo: ctx.OriginatingTo, + AccountId: ctx.AccountId, + MessageThreadId: ctx.MessageThreadId, + } + : undefined; + const followupRun = { prompt: queuedBody, messageId: sessionCtx.MessageSidFull ?? sessionCtx.MessageSid, summaryLine: baseBodyTrimmedRaw, enqueuedAt: Date.now(), + mediaContext, // Originating channel for reply routing. originatingChannel: ctx.OriginatingChannel, originatingTo: ctx.OriginatingTo, diff --git a/src/auto-reply/reply/queue.ts b/src/auto-reply/reply/queue.ts index b097b6c5193..5c3a56f64af 100644 --- a/src/auto-reply/reply/queue.ts +++ b/src/auto-reply/reply/queue.ts @@ -1,6 +1,6 @@ -export { extractQueueDirective } from "./queue/directive.js"; -export { clearSessionQueues } from "./queue/cleanup.js"; export type { ClearSessionQueueResult } from "./queue/cleanup.js"; +export { clearSessionQueues } from "./queue/cleanup.js"; +export { extractQueueDirective } from "./queue/directive.js"; export { scheduleFollowupDrain } from "./queue/drain.js"; export { enqueueFollowupRun, @@ -10,6 +10,7 @@ export { export { resolveQueueSettings } from "./queue/settings.js"; export { clearFollowupQueue } from "./queue/state.js"; export type { + FollowupMediaContext, FollowupRun, QueueDedupeMode, QueueDropPolicy, diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts index 507f77d499d..637ceb0a149 100644 --- a/src/auto-reply/reply/queue/types.ts +++ b/src/auto-reply/reply/queue/types.ts @@ -2,6 +2,10 @@ import type { ExecToolDefaults } from "../../../agents/bash-tools.js"; import type { SkillSnapshot } from "../../../agents/skills.js"; import type { OpenClawConfig } from "../../../config/config.js"; import type { SessionEntry } from "../../../config/sessions.js"; +import type { + MediaUnderstandingDecision, + MediaUnderstandingOutput, +} from "../../../media-understanding/types.js"; import type { InputProvenance } from "../../../sessions/input-provenance.js"; import type { OriginatingChannelType } from "../../templating.js"; import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "../directives.js"; @@ -19,12 +23,45 @@ export type QueueSettings = { export type QueueDedupeMode = "message-id" | "prompt" | "none"; +/** + * Snapshot of media-related context fields carried on a FollowupRun so that + * the followup runner can apply media understanding (e.g. voice-note + * transcription) when it was not applied — or failed — in the primary path. + */ +export type FollowupMediaContext = { + Body?: string; + CommandBody?: string; + RawBody?: string; + MediaPath?: string; + MediaUrl?: string; + MediaType?: string; + MediaDir?: string; + MediaPaths?: string[]; + MediaUrls?: string[]; + MediaTypes?: string[]; + MediaRemoteHost?: string; + Transcript?: string; + MediaUnderstanding?: MediaUnderstandingOutput[]; + MediaUnderstandingDecisions?: MediaUnderstandingDecision[]; + OriginatingChannel?: OriginatingChannelType; + OriginatingTo?: string; + AccountId?: string; + MessageThreadId?: string | number; +}; + export type FollowupRun = { prompt: string; /** Provider message ID, when available (for deduplication). */ messageId?: string; summaryLine?: string; enqueuedAt: number; + /** + * Media context snapshot from the original inbound message. + * When present and MediaUnderstanding is empty, the followup runner will + * attempt to apply media understanding (audio transcription, etc.) before + * passing the prompt to the agent. + */ + mediaContext?: FollowupMediaContext; /** * Originating channel for reply routing. * When set, replies should be routed back to this provider From b6112bbfdf72c519bb05f25044c17ade2a0a1f16 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 15:29:55 -0400 Subject: [PATCH 02/16] fix queued media-understanding prompt rebuild --- src/auto-reply/reply/followup-runner.test.ts | 170 ++++++++++++++++++- src/auto-reply/reply/followup-runner.ts | 116 ++++++++++--- 2 files changed, 260 insertions(+), 26 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 132b1291b39..1faac44a8a3 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -66,6 +66,9 @@ beforeEach(() => { const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun => createMockFollowupRun({ run: { messageProvider } }); +const MEDIA_REPLY_HINT = + "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body."; + function createQueuedRun( overrides: Partial> & { run?: Partial; @@ -576,7 +579,7 @@ describe("createFollowupRunner media understanding", () => { it("applies audio transcription when mediaContext has untranscribed audio", async () => { const transcriptText = "Hello, this is a voice note."; // The real applyMediaUnderstanding mutates the ctx; the mock must do the same - // so buildInboundMediaNote sees MediaUnderstanding and suppresses the audio line. + // so buildInboundMediaNote and queued prompt rebuilding see the transcribed body. applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { params.ctx.MediaUnderstanding = [ @@ -588,6 +591,7 @@ describe("createFollowupRunner media understanding", () => { }, ]; params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`; return { outputs: [ { @@ -764,6 +768,7 @@ describe("createFollowupRunner media understanding", () => { }, ]; params.ctx.Transcript = "voice transcript"; + params.ctx.Body = "[Audio]\nUser text:\nsome text\nTranscript:\nvoice transcript"; return { outputs: [ { @@ -815,4 +820,167 @@ describe("createFollowupRunner media understanding", () => { // The transcript should be present expect(agentCall?.prompt).toContain("voice transcript"); }); + + it("strips queued media lines when attachment paths or URLs contain a literal closing bracket", async () => { + const transcriptText = "Bracket-safe transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "ok" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: + "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg?sig=abc]123]\n" + + MEDIA_REPLY_HINT + + "\n" + + "some text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice[0].ogg"], + MediaUrls: ["https://cdn.example.com/files[0].ogg?sig=abc]123"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + expect(agentCall?.prompt).not.toContain("/tmp/voice[0].ogg"); + expect(agentCall?.prompt).not.toContain("https://cdn.example.com/files[0].ogg?sig=abc]123"); + expect(agentCall?.prompt).not.toContain(MEDIA_REPLY_HINT); + }); + + it("preserves file-only media understanding when outputs are empty", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = + '\nQuarterly report body\n'; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\n[User sent media without caption]`, + mediaContext: { + Body: "", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("[media attached: /tmp/report.pdf (application/pdf)]"); + expect(agentCall?.prompt).toContain(MEDIA_REPLY_HINT); + expect(agentCall?.prompt).toContain(''); + expect(agentCall?.prompt).toContain("Quarterly report body"); + expect(agentCall?.prompt).not.toContain("[User sent media without caption]"); + }); + + it("replaces the queued body when inline directives were already stripped from the prompt", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = + '/think high summarize this\n\n\nreport\n'; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("summarize this"); + expect(agentCall?.prompt).toContain(''); + expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this"); + expect(agentCall?.prompt).not.toContain("/think high summarize this"); + }); }); diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 7a59cea3fbd..1688ffe9ccc 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -10,7 +10,6 @@ import type { TypingMode } from "../../config/types.js"; import { logVerbose } from "../../globals.js"; import { registerAgentRunContext } from "../../infra/agent-events.js"; import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; -import { formatMediaUnderstandingBody } from "../../media-understanding/format.js"; import { defaultRuntime } from "../../runtime.js"; import { isInternalMessageChannel } from "../../utils/message-channel.js"; import { stripHeartbeatToken } from "../heartbeat.js"; @@ -19,6 +18,7 @@ import type { MsgContext, OriginatingChannelType } from "../templating.js"; import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; import { resolveRunAuthProfile } from "./agent-runner-utils.js"; +import { parseInlineDirectives } from "./directive-handling.js"; import { resolveOriginAccountId, resolveOriginMessageProvider, @@ -37,6 +37,86 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r import { createTypingSignaler } from "./typing-mode.js"; import type { TypingController } from "./typing.js"; +const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; +const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; + +function stripLeadingMediaAttachedLines(prompt: string): string { + const lines = prompt.split("\n"); + let index = 0; + while (index < lines.length) { + const trimmed = lines[index]?.trim() ?? ""; + if (!trimmed.startsWith("[media attached") || !trimmed.endsWith("]")) { + break; + } + index += 1; + } + return lines.slice(index).join("\n").trim(); +} + +function stripLeadingMediaReplyHint(prompt: string): string { + const lines = prompt.split("\n"); + if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) { + return lines.slice(1).join("\n").trim(); + } + return prompt.trim(); +} + +function replaceLastOccurrence( + value: string, + search: string, + replacement: string, +): string | undefined { + if (!search) { + return undefined; + } + const index = value.lastIndexOf(search); + if (index < 0) { + return undefined; + } + return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; +} + +function stripInlineDirectives(text: string | undefined): string { + return parseInlineDirectives(text ?? "").cleaned.trim(); +} + +function rebuildQueuedPromptWithMediaUnderstanding(params: { + prompt: string; + originalBody?: string; + updatedBody?: string; + mediaNote?: string; +}): string { + let stripped = stripLeadingMediaAttachedLines(params.prompt); + if (!params.mediaNote) { + stripped = stripLeadingMediaReplyHint(stripped); + } + + const updatedBody = stripInlineDirectives(params.updatedBody); + if (!updatedBody) { + return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim(); + } + + const replacementTargets = [ + params.originalBody?.trim(), + stripInlineDirectives(params.originalBody), + MEDIA_ONLY_PLACEHOLDER, + ].filter( + (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index, + ); + + let rebuilt = stripped; + for (const target of replacementTargets) { + const replaced = replaceLastOccurrence(rebuilt, target, updatedBody); + if (replaced !== undefined) { + rebuilt = replaced; + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); + } + } + + rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n"); + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); +} + export function createFollowupRunner(params: { opts?: GetReplyOptions; typing: TypingController; @@ -170,6 +250,7 @@ export function createFollowupRunner(params: { if (hasMedia) { try { const mediaCtx = { ...queued.mediaContext } as MsgContext; + const originalBody = mediaCtx.Body; const muResult = await applyMediaUnderstanding({ ctx: mediaCtx, cfg: queued.run.config, @@ -179,34 +260,19 @@ export function createFollowupRunner(params: { model: queued.run.model, }, }); - if (muResult.outputs.length > 0) { - // Rebuild the prompt with media understanding results baked in, - // matching the primary path's formatting. + if (muResult.outputs.length > 0 || muResult.appliedFile) { + // Rebuild the queued prompt from the mutated media context so the + // deferred path matches the primary path's prompt shape. const newMediaNote = buildInboundMediaNote(mediaCtx); - const transcriptBody = formatMediaUnderstandingBody({ - body: undefined, - outputs: muResult.outputs, + queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({ + prompt: queued.prompt, + originalBody, + updatedBody: mediaCtx.Body, + mediaNote: newMediaNote, }); - // Strip existing [media attached ...] lines from the prompt so - // they can be replaced by the updated media note (which excludes - // successfully-understood attachments like transcribed audio). - const stripped = queued.prompt - .replace(/\[media attached: \d+ files\]\n?/g, "") - .replace(/\[media attached[^\]]*\]\n?/g, ""); - - const parts: string[] = []; - if (newMediaNote) { - parts.push(newMediaNote); - } - if (transcriptBody) { - parts.push(transcriptBody); - } - parts.push(stripped.trim()); - queued.prompt = parts.filter(Boolean).join("\n\n"); - logVerbose( - `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage})`, + `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`, ); } } catch (err) { From dd19a53a8ed6a3156cc49c06c1ebd3d876690d44 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 15:45:17 -0400 Subject: [PATCH 03/16] fix: rebuild queued followup media prompts --- src/auto-reply/reply/followup-runner.test.ts | 180 +++++++++++++++++++ src/auto-reply/reply/followup-runner.ts | 21 ++- 2 files changed, 196 insertions(+), 5 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 1faac44a8a3..f866c664763 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -653,6 +653,138 @@ describe("createFollowupRunner media understanding", () => { expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" })); }); + it("strips the full media line when attachment paths or URLs contain brackets", async () => { + const transcriptText = "Bracket-safe transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "done" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: + "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg]\nsome text", + mediaContext: { + Body: "some text", + CommandBody: "some text", + RawBody: "some text", + MediaPaths: ["/tmp/voice[0].ogg"], + MediaUrls: ["https://cdn.example.com/files[0].ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + expect(agentCall?.prompt).not.toContain("[media attached:"); + expect(agentCall?.prompt).not.toContain("files[0].ogg]"); + }); + + it("only strips leading synthetic media lines and preserves literal user text later in the prompt", async () => { + const transcriptText = "Transcript with literal token"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = "I literally typed [media attached: keep me] in this message."; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "done" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: + "[media attached: /tmp/voice.ogg (audio/ogg)]\nI literally typed [media attached: keep me] in this message.", + mediaContext: { + Body: "I literally typed [media attached: keep me] in this message.", + CommandBody: "I literally typed [media attached: keep me] in this message.", + RawBody: "I literally typed [media attached: keep me] in this message.", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain( + "I literally typed [media attached: keep me] in this message.", + ); + expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg (audio/ogg)]"); + }); + it("skips media understanding when MediaUnderstanding is already populated", async () => { runEmbeddedPiAgentMock.mockResolvedValueOnce({ payloads: [{ text: "reply" }], @@ -755,6 +887,54 @@ describe("createFollowupRunner media understanding", () => { expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" })); }); + it("rebuilds the prompt when file extraction succeeds without media outputs", async () => { + const fileBlock = '\nline one\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `some text\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "file processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: /tmp/notes.txt (text/plain)]\nsome text", + mediaContext: { + Body: "some text", + CommandBody: "some text", + RawBody: "some text", + MediaPaths: ["/tmp/notes.txt"], + MediaTypes: ["text/plain"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("[media attached: /tmp/notes.txt (text/plain)]"); + expect(agentCall?.prompt).toContain(fileBlock); + expect(agentCall?.prompt?.match(/ { applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 1688ffe9ccc..45534879e78 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -39,13 +39,16 @@ import type { TypingController } from "./typing.js"; const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; +const LEADING_MEDIA_ATTACHED_LINE_RE = + /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/; +const FILE_BLOCK_RE = / 0 || muResult.appliedFile) { + const shouldRebuildPrompt = + muResult.outputs.length > 0 || + (muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt)); + if (shouldRebuildPrompt) { // Rebuild the queued prompt from the mutated media context so the // deferred path matches the primary path's prompt shape. const newMediaNote = buildInboundMediaNote(mediaCtx); @@ -270,7 +282,6 @@ export function createFollowupRunner(params: { updatedBody: mediaCtx.Body, mediaNote: newMediaNote, }); - logVerbose( `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`, ); From 9dd7e2c791f0276803df166d18474f915a34c29a Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 16:56:36 -0400 Subject: [PATCH 04/16] fix: narrow FILE_BLOCK_RE, align originalBody, check body not prompt --- src/auto-reply/reply/followup-runner.test.ts | 109 +++++++++++++++++++ src/auto-reply/reply/followup-runner.ts | 21 ++-- 2 files changed, 123 insertions(+), 7 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index f866c664763..1956b450fa8 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1163,4 +1163,113 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this"); expect(agentCall?.prompt).not.toContain("/think high summarize this"); }); + + it("does not false-positive on user text containing literal ' { + const fileBlock = '\ncol1,col2\n1,2\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `check my {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // User message contains literal " { + const fileBlock = '\nreport content\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + // applyMediaUnderstanding mutates the resolved body (which is CommandBody) + params.ctx.Body = `summarize this\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // Body has directive prefix; CommandBody has the cleaned version. + // The prompt was built from CommandBody, so originalBody should match CommandBody + // for accurate replacement. + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + CommandBody: "summarize this", + RawBody: "/think high summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + // File block should be present (extraction succeeded) + expect(agentCall?.prompt).toContain(fileBlock); + // The body text should appear once, not duplicated + expect(agentCall?.prompt).toContain("summarize this"); + // Should NOT contain the directive prefix + expect(agentCall?.prompt).not.toContain("/think high"); + // The body should not be duplicated (would happen if originalBody didn't match) + const matches = agentCall?.prompt?.match(/summarize this/g); + expect(matches?.length).toBe(1); + }); }); diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index 45534879e78..d2a2d4efd44 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -41,7 +41,7 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; const LEADING_MEDIA_ATTACHED_LINE_RE = /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/; -const FILE_BLOCK_RE = / 0 || - (muResult.appliedFile && !FILE_BLOCK_RE.test(queued.prompt)); + (muResult.appliedFile && !bodyAlreadyHasFileBlock); if (shouldRebuildPrompt) { // Rebuild the queued prompt from the mutated media context so the // deferred path matches the primary path's prompt shape. From a472dce14735aeca65d69f2928950c6a9a574859 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 17:53:40 -0400 Subject: [PATCH 05/16] Auto-reply: preserve deferred media understanding output --- src/auto-reply/reply/followup-runner.test.ts | 306 +++++++++++++++++- src/auto-reply/reply/followup-runner.ts | 38 ++- .../reply/get-reply-run.media-only.test.ts | 74 +++++ src/auto-reply/reply/get-reply-run.ts | 14 +- 4 files changed, 424 insertions(+), 8 deletions(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 1956b450fa8..7d06c882c2c 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -653,6 +653,69 @@ describe("createFollowupRunner media understanding", () => { expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" })); }); + it("applies media understanding for URL-only attachments", async () => { + const transcriptText = "URL-only transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "Got it!" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text", + mediaContext: { + Body: "some text", + MediaUrl: "https://cdn.example.com/voice.ogg", + MediaUrls: ["https://cdn.example.com/voice.ogg"], + MediaType: "audio/ogg", + MediaTypes: ["audio/ogg"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + }); + it("strips the full media line when attachment paths or URLs contain brackets", async () => { const transcriptText = "Bracket-safe transcript"; applyMediaUnderstandingMock.mockImplementationOnce( @@ -1164,6 +1227,98 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt).not.toContain("/think high summarize this"); }); + it("preserves directive-like tokens inside extracted media content", async () => { + const fileBlock = + '\n/model claude-opus should stay\n/queue followup should stay\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `/think high summarize this\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + MediaPaths: ["/tmp/notes.txt"], + MediaTypes: ["text/plain"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("summarize this"); + expect(agentCall?.prompt).not.toContain("/think high summarize this"); + expect(agentCall?.prompt).toContain("/model claude-opus should stay"); + expect(agentCall?.prompt).toContain("/queue followup should stay"); + }); + + it("rebuilds the prompt when image understanding mutates the body without outputs", async () => { + const description = "[Image]\nDescription:\na mountain at sunset"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = description; + return { + outputs: [], + decisions: [], + appliedImage: true, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/photo.jpg"], + MediaTypes: ["image/jpeg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("a mountain at sunset"); + }); + it("does not false-positive on user text containing literal ' { const fileBlock = '\ncol1,col2\n1,2\n'; applyMediaUnderstandingMock.mockImplementationOnce( @@ -1195,8 +1350,7 @@ describe("createFollowupRunner media understanding", () => { // file extraction results from being embedded in the prompt. await runner( createQueuedRun({ - prompt: - "[media attached: /tmp/data.csv (text/csv)]\ncheck my { expect(agentCall?.prompt).toContain("check my { + const fileBlock = + '\nRun `/think high` literally in the shell example.\n'; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = `summarize this\n\n${fileBlock}`; + return { + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`, + mediaContext: { + Body: "/think high summarize this", + CommandBody: "summarize this", + RawBody: "/think high summarize this", + MediaPaths: ["/tmp/notes.txt"], + MediaTypes: ["text/plain"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("summarize this"); + expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example."); + }); + + it("rebuilds the prompt when image understanding mutates the body without outputs", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items."; + return { + outputs: [], + decisions: [], + appliedImage: true, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text", + mediaContext: { + Body: "some text", + CommandBody: "some text", + RawBody: "some text", + MediaPaths: ["/tmp/board.jpg"], + MediaTypes: ["image/jpeg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("[Image summary]"); + expect(agentCall?.prompt).toContain("A whiteboard with action items."); + }); + + it("applies media understanding for URL-only deferred attachments", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.Body = "[Audio]\nTranscript:\nremote transcript"; + params.ctx.Transcript = "remote transcript"; + return { + outputs: [ + { + kind: "audio.transcription", + text: "remote transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[User sent media without caption]", + mediaContext: { + Body: "", + MediaUrl: "https://cdn.example.com/audio.ogg", + MediaUrls: ["https://cdn.example.com/audio.ogg"], + MediaType: "audio/ogg", + MediaTypes: ["audio/ogg"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("remote transcript"); + }); + it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => { const fileBlock = '\nreport content\n'; applyMediaUnderstandingMock.mockImplementationOnce( diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index d2a2d4efd44..87e2f975ce0 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -39,8 +39,7 @@ import type { TypingController } from "./typing.js"; const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; -const LEADING_MEDIA_ATTACHED_LINE_RE = - /^(?:\[media attached: \d+ files\]|\[media attached(?: \d+\/\d+)?: [^\r\n]*\])$/; +const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; const FILE_BLOCK_RE = / 0), + queued.mediaContext.MediaPaths.length > 0) || + (Array.isArray(queued.mediaContext.MediaUrls) && + queued.mediaContext.MediaUrls.length > 0), ); if (hasMedia) { try { @@ -278,6 +305,9 @@ export function createFollowupRunner(params: { }); const shouldRebuildPrompt = muResult.outputs.length > 0 || + muResult.appliedAudio || + muResult.appliedImage || + muResult.appliedVideo || (muResult.appliedFile && !bodyAlreadyHasFileBlock); if (shouldRebuildPrompt) { // Rebuild the queued prompt from the mutated media context so the diff --git a/src/auto-reply/reply/get-reply-run.media-only.test.ts b/src/auto-reply/reply/get-reply-run.media-only.test.ts index 829b3937009..f519da10082 100644 --- a/src/auto-reply/reply/get-reply-run.media-only.test.ts +++ b/src/auto-reply/reply/get-reply-run.media-only.test.ts @@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => { expect(call?.followupRun.prompt).toContain("[User sent media without caption]"); }); + it("snapshots URL-only attachments into followup mediaContext", async () => { + await runPreparedReply( + baseParams({ + ctx: { + Body: "check this attachment", + RawBody: "check this attachment", + CommandBody: "check this attachment", + ThreadHistoryBody: "Earlier message in this thread", + OriginatingChannel: "slack", + OriginatingTo: "C123", + ChatType: "group", + MediaUrl: "https://cdn.example.com/input.png", + MediaUrls: ["https://cdn.example.com/input.png"], + MediaType: "image/png", + MediaTypes: ["image/png"], + }, + sessionCtx: { + Body: "check this attachment", + BodyStripped: "check this attachment", + ThreadHistoryBody: "Earlier message in this thread", + Provider: "slack", + ChatType: "group", + OriginatingChannel: "slack", + OriginatingTo: "C123", + }, + }), + ); + + const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0]; + expect(call?.followupRun.mediaContext).toEqual( + expect.objectContaining({ + MediaUrl: "https://cdn.example.com/input.png", + MediaUrls: ["https://cdn.example.com/input.png"], + MediaType: "image/png", + MediaTypes: ["image/png"], + }), + ); + }); + it("keeps thread history context on follow-up turns", async () => { const result = await runPreparedReply( baseParams({ @@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => { expect(call?.followupRun.prompt).toContain("Earlier message in this thread"); }); + it("snapshots mediaContext for URL-only deferred attachments", async () => { + await runPreparedReply( + baseParams({ + ctx: { + Body: "", + RawBody: "", + CommandBody: "", + MediaUrl: "https://cdn.example.com/audio.ogg", + MediaUrls: ["https://cdn.example.com/audio.ogg"], + MediaType: "audio/ogg", + MediaTypes: ["audio/ogg"], + ThreadHistoryBody: "Earlier message in this thread", + OriginatingChannel: "slack", + OriginatingTo: "C123", + ChatType: "group", + }, + sessionCtx: { + Body: "", + BodyStripped: "", + ThreadHistoryBody: "Earlier message in this thread", + Provider: "slack", + ChatType: "group", + OriginatingChannel: "slack", + OriginatingTo: "C123", + }, + }), + ); + + const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0]; + expect(call?.followupRun.mediaContext?.MediaUrl).toBe("https://cdn.example.com/audio.ogg"); + expect(call?.followupRun.mediaContext?.MediaUrls).toEqual([ + "https://cdn.example.com/audio.ogg", + ]); + }); + it("returns the empty-body reply when there is no text and no media", async () => { const result = await runPreparedReply( baseParams({ diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts index 4cf76e6ed31..32999a66238 100644 --- a/src/auto-reply/reply/get-reply-run.ts +++ b/src/auto-reply/reply/get-reply-run.ts @@ -307,7 +307,14 @@ export async function runPreparedReply( : [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n"); const baseBodyTrimmed = baseBodyForPrompt.trim(); const hasMediaAttachment = Boolean( - sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0), + sessionCtx.MediaPath || + sessionCtx.MediaUrl || + (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) || + (sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) || + ctx.MediaPath?.trim() || + ctx.MediaUrl?.trim() || + (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) || + (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0), ); if (!baseBodyTrimmed && !hasMediaAttachment) { await typing.onReplyStart(); @@ -473,7 +480,10 @@ export async function runPreparedReply( // followup runner. When MediaUnderstanding is already populated the runner // knows transcription already succeeded and skips re-application. const hasMediaAttachments = Boolean( - ctx.MediaPath?.trim() || (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0), + ctx.MediaPath?.trim() || + ctx.MediaUrl?.trim() || + (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) || + (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0), ); const mediaContext = hasMediaAttachments ? { From 8a5ad5e320beb0b5d50fb8ab8a07ee246a754b01 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 21:54:35 -0400 Subject: [PATCH 06/16] Reply: preserve deferred queued media context --- src/auto-reply/reply/followup-media.ts | 241 +++++++++++++++++++ src/auto-reply/reply/followup-runner.test.ts | 225 ++++++++++++++++- src/auto-reply/reply/followup-runner.ts | 180 +------------- src/auto-reply/reply/get-reply-run.ts | 2 + src/auto-reply/reply/queue/drain.ts | 71 +++++- src/auto-reply/reply/queue/enqueue.ts | 37 ++- src/auto-reply/reply/queue/state.ts | 3 + src/auto-reply/reply/queue/types.ts | 3 + 8 files changed, 572 insertions(+), 190 deletions(-) create mode 100644 src/auto-reply/reply/followup-media.ts diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts new file mode 100644 index 00000000000..5a014e63f9b --- /dev/null +++ b/src/auto-reply/reply/followup-media.ts @@ -0,0 +1,241 @@ +import { logVerbose } from "../../globals.js"; +import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; +import { + normalizeAttachments, + resolveAttachmentKind, +} from "../../media-understanding/attachments.js"; +import { buildInboundMediaNote } from "../media-note.js"; +import type { MsgContext } from "../templating.js"; +import { parseInlineDirectives } from "./directive-handling.js"; +import type { FollowupMediaContext, FollowupRun } from "./queue/types.js"; + +const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; +const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; +const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; +const FILE_BLOCK_RE = / Boolean(value) && list.indexOf(value) === index, + ); + + let rebuilt = stripped; + for (const target of replacementTargets) { + const replaced = replaceLastOccurrence(rebuilt, target, updatedBody); + if (replaced !== undefined) { + rebuilt = replaced; + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); + } + } + + rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n"); + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); +} + +function hasMediaAttachments(mediaContext: FollowupMediaContext): boolean { + return Boolean( + mediaContext.MediaPath?.trim() || + mediaContext.MediaUrl?.trim() || + (Array.isArray(mediaContext.MediaPaths) && mediaContext.MediaPaths.length > 0) || + (Array.isArray(mediaContext.MediaUrls) && mediaContext.MediaUrls.length > 0), + ); +} + +function hasOnlyFileLikeAttachments(mediaContext: FollowupMediaContext): boolean { + const attachments = normalizeAttachments(mediaContext as MsgContext); + return ( + attachments.length > 0 && + attachments.every((attachment) => { + const kind = resolveAttachmentKind(attachment); + return kind !== "audio" && kind !== "image" && kind !== "video"; + }) + ); +} + +function snapshotUpdatedMediaContext(params: { + original: FollowupMediaContext; + mediaCtx: MsgContext; + updatedBody?: string; +}): FollowupMediaContext { + return { + ...params.original, + Body: params.updatedBody ?? params.original.Body, + Transcript: + typeof params.mediaCtx.Transcript === "string" + ? params.mediaCtx.Transcript + : params.original.Transcript, + MediaUnderstanding: Array.isArray(params.mediaCtx.MediaUnderstanding) + ? [...params.mediaCtx.MediaUnderstanding] + : params.original.MediaUnderstanding, + MediaUnderstandingDecisions: Array.isArray(params.mediaCtx.MediaUnderstandingDecisions) + ? [...params.mediaCtx.MediaUnderstandingDecisions] + : params.original.MediaUnderstandingDecisions, + DeferredMediaApplied: true, + }; +} + +export async function applyDeferredMediaUnderstandingToQueuedRun( + queued: FollowupRun, + params: { logLabel?: string } = {}, +): Promise { + const mediaContext = queued.mediaContext; + if (!mediaContext || mediaContext.DeferredMediaApplied) { + return; + } + if (mediaContext.MediaUnderstanding?.length) { + mediaContext.DeferredMediaApplied = true; + return; + } + if (!hasMediaAttachments(mediaContext)) { + mediaContext.DeferredMediaApplied = true; + return; + } + + const resolvedOriginalBody = + mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body; + const bodyAlreadyHasFileBlock = + FILE_BLOCK_RE.test(resolvedOriginalBody ?? "") || FILE_BLOCK_RE.test(mediaContext.Body ?? ""); + + if (bodyAlreadyHasFileBlock && hasOnlyFileLikeAttachments(mediaContext)) { + mediaContext.DeferredMediaApplied = true; + return; + } + + try { + const mediaCtx = { + ...mediaContext, + Body: resolvedOriginalBody, + Provider: + mediaContext.Provider ?? + queued.run.messageProvider ?? + (typeof mediaContext.OriginatingChannel === "string" + ? mediaContext.OriginatingChannel + : undefined), + Surface: mediaContext.Surface, + } as MsgContext; + + const muResult = await applyMediaUnderstanding({ + ctx: mediaCtx, + cfg: queued.run.config, + agentDir: queued.run.agentDir, + activeModel: { + provider: queued.run.provider, + model: queued.run.model, + }, + }); + + const shouldRebuildPrompt = + muResult.outputs.length > 0 || + muResult.appliedAudio || + muResult.appliedImage || + muResult.appliedVideo || + (muResult.appliedFile && !bodyAlreadyHasFileBlock); + + if (shouldRebuildPrompt) { + const newMediaNote = buildInboundMediaNote(mediaCtx); + queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({ + prompt: queued.prompt, + originalBody: resolvedOriginalBody, + updatedBody: mediaCtx.Body, + mediaNote: newMediaNote, + }); + logVerbose( + `${params.logLabel ?? "followup"}: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`, + ); + } + + queued.mediaContext = snapshotUpdatedMediaContext({ + original: mediaContext, + mediaCtx, + updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined, + }); + } catch (err) { + logVerbose( + `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`, + ); + } +} diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index a8e31f998de..ea7448b74e5 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -3,7 +3,8 @@ import { tmpdir } from "node:os"; import path from "node:path"; import { beforeEach, describe, expect, it, vi } from "vitest"; import { loadSessionStore, type SessionEntry, saveSessionStore } from "../../config/sessions.js"; -import type { FollowupRun } from "./queue.js"; +import { buildCollectPrompt } from "../../utils/queue-helpers.js"; +import type { FollowupRun } from "./queue/types.js"; import { createMockFollowupRun, createMockTypingController } from "./test-helpers.js"; const runEmbeddedPiAgentMock = vi.fn(); @@ -34,6 +35,10 @@ vi.mock("./route-reply.js", async (importOriginal) => { }); import { createFollowupRunner } from "./followup-runner.js"; +import { + applyDeferredMediaToQueuedRuns, + buildMediaAwareQueueSummaryPrompt, +} from "./queue/drain.js"; const ROUTABLE_TEST_CHANNELS = new Set([ "telegram", @@ -757,6 +762,69 @@ describe("createFollowupRunner media understanding", () => { expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" })); }); + it("propagates the queued message provider into deferred media context", async () => { + const transcriptText = "Provider-aware transcript"; + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + expect(params.ctx.Provider).toBe("telegram"); + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "done" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: "[User sent media without caption]", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(transcriptText); + }); + it("applies media understanding for URL-only attachments", async () => { const transcriptText = "URL-only transcript"; applyMediaUnderstandingMock.mockImplementationOnce( @@ -1678,4 +1746,159 @@ describe("createFollowupRunner media understanding", () => { const matches = agentCall?.prompt?.match(/summarize this/g); expect(matches?.length).toBe(1); }); + + it("does not re-apply file extraction when the stored media body already has a file block", async () => { + const fileBlock = '\nreport content\n'; + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`, + mediaContext: { + Body: `summarize this\n\n${fileBlock}`, + CommandBody: "summarize this", + RawBody: "summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).not.toHaveBeenCalled(); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt?.match(/ { + it("preprocesses collect batches before synthesizing the followup prompt", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: "collect transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = "collect transcript"; + params.ctx.Body = "[Audio]\nTranscript:\ncollect transcript"; + return { + outputs: [ + { + kind: "audio.transcription", + text: "collect transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + const items: FollowupRun[] = [ + createQueuedRun({ + prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text", + summaryLine: "some text", + originatingChannel: "telegram", + originatingTo: "chat:1", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + createQueuedRun({ + prompt: "second text", + summaryLine: "second text", + originatingChannel: "telegram", + originatingTo: "chat:1", + run: { messageProvider: "telegram" }, + }), + ]; + + await applyDeferredMediaToQueuedRuns(items); + + const prompt = buildCollectPrompt({ + title: "[Queued messages while agent was busy]", + items, + renderItem: (item, idx) => `---\nQueued #${idx + 1}\n${item.prompt}`.trim(), + }); + + expect(prompt).toContain("collect transcript"); + expect(prompt).toContain("Queued #2\nsecond text"); + expect(prompt).not.toContain("[media attached: /tmp/voice.ogg"); + }); + + it("preprocesses dropped media items before building overflow summaries", async () => { + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: "overflow transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = "overflow transcript"; + params.ctx.Body = "[Audio]\nTranscript:\noverflow transcript"; + return { + outputs: [ + { + kind: "audio.transcription", + text: "overflow transcript", + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({ + dropPolicy: "summarize", + droppedCount: 1, + summaryLines: ["[media attached: /tmp/voice.ogg (audio/ogg)]"], + summaryItems: [ + createQueuedRun({ + prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]", + summaryLine: "", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ], + noun: "message", + }); + + expect(summaryPrompt).toContain("[Queue overflow] Dropped 1 message due to cap."); + expect(summaryPrompt).toContain("overflow transcript"); + expect(summaryPrompt).not.toContain("[media attached: /tmp/voice.ogg"); + }); }); diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts index d10d63d6389..50925cd1269 100644 --- a/src/auto-reply/reply/followup-runner.ts +++ b/src/auto-reply/reply/followup-runner.ts @@ -9,16 +9,14 @@ import type { SessionEntry } from "../../config/sessions.js"; import type { TypingMode } from "../../config/types.js"; import { logVerbose } from "../../globals.js"; import { registerAgentRunContext } from "../../infra/agent-events.js"; -import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; import { defaultRuntime } from "../../runtime.js"; import { isInternalMessageChannel } from "../../utils/message-channel.js"; import { stripHeartbeatToken } from "../heartbeat.js"; -import { buildInboundMediaNote } from "../media-note.js"; -import type { MsgContext, OriginatingChannelType } from "../templating.js"; +import type { OriginatingChannelType } from "../templating.js"; import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js"; import type { GetReplyOptions, ReplyPayload } from "../types.js"; import { resolveRunAuthProfile } from "./agent-runner-utils.js"; -import { parseInlineDirectives } from "./directive-handling.js"; +import { applyDeferredMediaUnderstandingToQueuedRun } from "./followup-media.js"; import { resolveOriginAccountId, resolveOriginMessageProvider, @@ -37,113 +35,6 @@ import { incrementRunCompactionCount, persistRunSessionUsage } from "./session-r import { createTypingSignaler } from "./typing-mode.js"; import type { TypingController } from "./typing.js"; -const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; -const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; -const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; -const FILE_BLOCK_RE = / Boolean(value) && list.indexOf(value) === index, - ); - - let rebuilt = stripped; - for (const target of replacementTargets) { - const replaced = replaceLastOccurrence(rebuilt, target, updatedBody); - if (replaced !== undefined) { - rebuilt = replaced; - return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); - } - } - - rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n"); - return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); -} - export function createFollowupRunner(params: { opts?: GetReplyOptions; typing: TypingController; @@ -264,72 +155,7 @@ export function createFollowupRunner(params: { let bootstrapPromptWarningSignaturesSeen = resolveBootstrapWarningSignaturesSeen( activeSessionEntry?.systemPromptReport, ); - - // Apply media understanding for followup-queued messages when it was - // not applied (or failed) in the primary path. This ensures voice - // notes that arrived while the agent was mid-turn still get transcribed. - if (queued.mediaContext && !queued.mediaContext.MediaUnderstanding?.length) { - const hasMedia = Boolean( - queued.mediaContext.MediaPath?.trim() || - queued.mediaContext.MediaUrl?.trim() || - (Array.isArray(queued.mediaContext.MediaPaths) && - queued.mediaContext.MediaPaths.length > 0) || - (Array.isArray(queued.mediaContext.MediaUrls) && - queued.mediaContext.MediaUrls.length > 0), - ); - if (hasMedia) { - try { - const resolvedOriginalBody = - queued.mediaContext.CommandBody ?? - queued.mediaContext.RawBody ?? - queued.mediaContext.Body; - const mediaCtx = { - ...queued.mediaContext, - Body: resolvedOriginalBody, - } as MsgContext; - const originalBody = resolvedOriginalBody; - // Capture whether the resolved body already contains a file block - // BEFORE applyMediaUnderstanding mutates it — this detects prior - // extraction so we avoid double-inserting. Checking the body - // (not the full queued.prompt) avoids false positives from user - // messages that happen to contain literal " 0 || - muResult.appliedAudio || - muResult.appliedImage || - muResult.appliedVideo || - (muResult.appliedFile && !bodyAlreadyHasFileBlock); - if (shouldRebuildPrompt) { - // Rebuild the queued prompt from the mutated media context so the - // deferred path matches the primary path's prompt shape. - const newMediaNote = buildInboundMediaNote(mediaCtx); - queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({ - prompt: queued.prompt, - originalBody, - updatedBody: mediaCtx.Body, - mediaNote: newMediaNote, - }); - logVerbose( - `followup: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`, - ); - } - } catch (err) { - logVerbose( - `followup: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`, - ); - } - } - } + await applyDeferredMediaUnderstandingToQueuedRun(queued, { logLabel: "followup" }); try { const fallbackResult = await runWithModelFallback({ diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts index 32999a66238..92ca1be7513 100644 --- a/src/auto-reply/reply/get-reply-run.ts +++ b/src/auto-reply/reply/get-reply-run.ts @@ -490,6 +490,8 @@ export async function runPreparedReply( Body: ctx.Body, CommandBody: ctx.CommandBody, RawBody: ctx.RawBody, + Provider: ctx.Provider ?? sessionCtx.Provider, + Surface: ctx.Surface ?? sessionCtx.Surface, MediaPath: ctx.MediaPath, MediaUrl: ctx.MediaUrl, MediaType: ctx.MediaType, diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts index 1e2fb33e4e0..68c660fe2b8 100644 --- a/src/auto-reply/reply/queue/drain.ts +++ b/src/auto-reply/reply/queue/drain.ts @@ -3,15 +3,17 @@ import { resolveGlobalMap } from "../../../shared/global-singleton.js"; import { buildCollectPrompt, beginQueueDrain, + buildQueueSummaryLine, + buildQueueSummaryPrompt, clearQueueSummaryState, drainCollectQueueStep, drainNextQueueItem, hasCrossChannelItems, - previewQueueSummaryPrompt, waitForQueueDebounce, } from "../../../utils/queue-helpers.js"; +import { applyDeferredMediaUnderstandingToQueuedRun } from "../followup-media.js"; import { isRoutableChannel } from "../route-reply.js"; -import { FOLLOWUP_QUEUES } from "./state.js"; +import { FOLLOWUP_QUEUES, type FollowupQueueState } from "./state.js"; import type { FollowupRun } from "./types.js"; // Persists the most recent runFollowup callback per queue key so that @@ -68,6 +70,50 @@ function resolveCrossChannelKey(item: FollowupRun): { cross?: true; key?: string }; } +function clearFollowupQueueSummaryState(queue: FollowupQueueState): void { + clearQueueSummaryState(queue); + queue.summaryItems = []; +} + +export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise { + for (const item of items) { + await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }); + } +} + +async function resolveSummaryLines(items: FollowupRun[]): Promise { + const summaryLines: string[] = []; + for (const item of items) { + await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }); + summaryLines.push(buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim())); + } + return summaryLines; +} + +export async function buildMediaAwareQueueSummaryPrompt(params: { + dropPolicy: FollowupQueueState["dropPolicy"]; + droppedCount: number; + summaryLines: string[]; + summaryItems: FollowupRun[]; + noun: string; +}): Promise { + if (params.dropPolicy !== "summarize" || params.droppedCount <= 0) { + return undefined; + } + const summaryLines = + params.summaryItems.length > 0 + ? await resolveSummaryLines(params.summaryItems) + : params.summaryLines; + return buildQueueSummaryPrompt({ + state: { + dropPolicy: params.dropPolicy, + droppedCount: params.droppedCount, + summaryLines: [...summaryLines], + }, + noun: params.noun, + }); +} + export function scheduleFollowupDrain( key: string, runFollowup: (run: FollowupRun) => Promise, @@ -107,7 +153,14 @@ export function scheduleFollowupDrain( } const items = queue.items.slice(); - const summary = previewQueueSummaryPrompt({ state: queue, noun: "message" }); + await applyDeferredMediaToQueuedRuns(items); + const summary = await buildMediaAwareQueueSummaryPrompt({ + dropPolicy: queue.dropPolicy, + droppedCount: queue.droppedCount, + summaryLines: queue.summaryLines, + summaryItems: queue.summaryItems, + noun: "message", + }); const run = items.at(-1)?.run ?? queue.lastRun; if (!run) { break; @@ -129,12 +182,18 @@ export function scheduleFollowupDrain( }); queue.items.splice(0, items.length); if (summary) { - clearQueueSummaryState(queue); + clearFollowupQueueSummaryState(queue); } continue; } - const summaryPrompt = previewQueueSummaryPrompt({ state: queue, noun: "message" }); + const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({ + dropPolicy: queue.dropPolicy, + droppedCount: queue.droppedCount, + summaryLines: queue.summaryLines, + summaryItems: queue.summaryItems, + noun: "message", + }); if (summaryPrompt) { const run = queue.lastRun; if (!run) { @@ -155,7 +214,7 @@ export function scheduleFollowupDrain( ) { break; } - clearQueueSummaryState(queue); + clearFollowupQueueSummaryState(queue); continue; } diff --git a/src/auto-reply/reply/queue/enqueue.ts b/src/auto-reply/reply/queue/enqueue.ts index 11da0db98fc..e58cc5ffac5 100644 --- a/src/auto-reply/reply/queue/enqueue.ts +++ b/src/auto-reply/reply/queue/enqueue.ts @@ -1,8 +1,8 @@ import { createDedupeCache } from "../../../infra/dedupe.js"; import { resolveGlobalSingleton } from "../../../shared/global-singleton.js"; -import { applyQueueDropPolicy, shouldSkipQueueItem } from "../../../utils/queue-helpers.js"; +import { buildQueueSummaryLine, shouldSkipQueueItem } from "../../../utils/queue-helpers.js"; import { kickFollowupDrainIfIdle } from "./drain.js"; -import { getExistingFollowupQueue, getFollowupQueue } from "./state.js"; +import { getExistingFollowupQueue, getFollowupQueue, type FollowupQueueState } from "./state.js"; import type { FollowupRun, QueueDedupeMode, QueueSettings } from "./types.js"; /** @@ -57,6 +57,34 @@ function isRunAlreadyQueued( return items.some((item) => item.prompt === run.prompt && hasSameRouting(item)); } +function applyFollowupQueueDropPolicy(queue: FollowupQueueState): boolean { + const cap = queue.cap; + if (cap <= 0 || queue.items.length < cap) { + return true; + } + if (queue.dropPolicy === "new") { + return false; + } + + const dropCount = queue.items.length - cap + 1; + const dropped = queue.items.splice(0, dropCount); + if (queue.dropPolicy === "summarize") { + for (const item of dropped) { + queue.droppedCount += 1; + queue.summaryItems.push(item); + queue.summaryLines.push( + buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim()), + ); + } + const limit = Math.max(0, cap); + while (queue.summaryLines.length > limit) { + queue.summaryLines.shift(); + queue.summaryItems.shift(); + } + } + return true; +} + export function enqueueFollowupRun( key: string, run: FollowupRun, @@ -83,10 +111,7 @@ export function enqueueFollowupRun( queue.lastEnqueuedAt = Date.now(); queue.lastRun = run.run; - const shouldEnqueue = applyQueueDropPolicy({ - queue, - summarize: (item) => item.summaryLine?.trim() || item.prompt.trim(), - }); + const shouldEnqueue = applyFollowupQueueDropPolicy(queue); if (!shouldEnqueue) { return false; } diff --git a/src/auto-reply/reply/queue/state.ts b/src/auto-reply/reply/queue/state.ts index 44208e727dd..94021dd0c4c 100644 --- a/src/auto-reply/reply/queue/state.ts +++ b/src/auto-reply/reply/queue/state.ts @@ -4,6 +4,7 @@ import type { FollowupRun, QueueDropPolicy, QueueMode, QueueSettings } from "./t export type FollowupQueueState = { items: FollowupRun[]; + summaryItems: FollowupRun[]; draining: boolean; lastEnqueuedAt: number; mode: QueueMode; @@ -47,6 +48,7 @@ export function getFollowupQueue(key: string, settings: QueueSettings): Followup const created: FollowupQueueState = { items: [], + summaryItems: [], draining: false, lastEnqueuedAt: 0, mode: settings.mode, @@ -78,6 +80,7 @@ export function clearFollowupQueue(key: string): number { } const cleared = queue.items.length + queue.droppedCount; queue.items.length = 0; + queue.summaryItems.length = 0; queue.droppedCount = 0; queue.summaryLines = []; queue.lastRun = undefined; diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts index 637ceb0a149..291059a28d7 100644 --- a/src/auto-reply/reply/queue/types.ts +++ b/src/auto-reply/reply/queue/types.ts @@ -32,6 +32,8 @@ export type FollowupMediaContext = { Body?: string; CommandBody?: string; RawBody?: string; + Provider?: string; + Surface?: string; MediaPath?: string; MediaUrl?: string; MediaType?: string; @@ -47,6 +49,7 @@ export type FollowupMediaContext = { OriginatingTo?: string; AccountId?: string; MessageThreadId?: string | number; + DeferredMediaApplied?: boolean; }; export type FollowupRun = { From 48b22eecd0dd7fb50bba83ced450981d16db253f Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 23:03:20 -0400 Subject: [PATCH 07/16] fix: set DeferredMediaApplied on error and strip old file blocks on rebuild --- src/auto-reply/reply/followup-media.ts | 15 +++ src/auto-reply/reply/followup-runner.test.ts | 111 +++++++++++++++++++ 2 files changed, 126 insertions(+) diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index 5a014e63f9b..5340d0df99a 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -13,6 +13,11 @@ const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]"; const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool"; const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/; const FILE_BLOCK_RE = /]*>[\s\S]*?<\/file>\n?/gi; + +function stripExistingFileBlocks(text: string): string { + return text.replace(FILE_BLOCK_FULL_RE, "").trim(); +} function stripLeadingMediaAttachedLines(prompt: string): string { const lines = prompt.split("\n"); @@ -87,6 +92,15 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: { stripped = stripLeadingMediaReplyHint(stripped); } + // Strip pre-existing file blocks from the prompt when the updated body + // contains new file blocks. Mixed messages (audio + PDF) can arrive with + // file extraction already applied in the primary path; without this strip + // the old block stays in the prompt while the updated body adds a new one, + // duplicating potentially large file payloads. + if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) { + stripped = stripExistingFileBlocks(stripped); + } + const updatedBody = normalizeUpdatedBody({ originalBody: params.originalBody, updatedBody: params.updatedBody, @@ -234,6 +248,7 @@ export async function applyDeferredMediaUnderstandingToQueuedRun( updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined, }); } catch (err) { + mediaContext.DeferredMediaApplied = true; logVerbose( `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`, ); diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index ea7448b74e5..3ef621c4a4b 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1747,6 +1747,117 @@ describe("createFollowupRunner media understanding", () => { expect(matches?.length).toBe(1); }); + it("does not duplicate file blocks for mixed audio+file messages re-processed in followup", async () => { + const existingFileBlock = + '\nold extracted content\n'; + const newFileBlock = + '\nnew extracted content\n'; + const transcriptText = "Mixed message transcript"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nanalyze this\n\n${newFileBlock}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // Simulate a mixed message where the primary path already extracted the + // PDF (file block is in the prompt) but audio transcription failed. + await runner( + createQueuedRun({ + prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nanalyze this\n\n${existingFileBlock}`, + mediaContext: { + Body: `analyze this\n\n${existingFileBlock}`, + CommandBody: "analyze this", + RawBody: "analyze this", + MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"], + MediaTypes: ["audio/ogg", "application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + // Should contain the transcript + expect(agentCall?.prompt).toContain(transcriptText); + // Should have exactly one file block (the new one), not two + expect(agentCall?.prompt?.match(/ { + applyMediaUnderstandingMock.mockRejectedValueOnce( + new Error("transcription service unavailable"), + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "fallback reply" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + const queued = createQueuedRun({ + prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text", + mediaContext: { + Body: "some text", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }); + + await runner(queued); + + // DeferredMediaApplied should be set so re-runs don't retry + expect(queued.mediaContext?.DeferredMediaApplied).toBe(true); + + // The agent should still be called with the raw prompt + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("some text"); + }); + it("does not re-apply file extraction when the stored media body already has a file block", async () => { const fileBlock = '\nreport content\n'; runEmbeddedPiAgentMock.mockResolvedValueOnce({ From 88ac4bb23800efc09e492b0400dfd787773fed37 Mon Sep 17 00:00:00 2001 From: Joseph Krug Date: Sat, 14 Mar 2026 23:26:27 -0400 Subject: [PATCH 08/16] Update src/auto-reply/reply/followup-runner.test.ts Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- src/auto-reply/reply/followup-runner.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 3ef621c4a4b..bae9fa98391 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1589,7 +1589,7 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example."); }); - it("rebuilds the prompt when image understanding mutates the body without outputs", async () => { + it("rebuilds the prompt when image understanding mutates the body alongside existing text", async () => { applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items."; From 4c56672e974c271318d4a245c2edefcb3bfed362 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sat, 14 Mar 2026 23:51:22 -0400 Subject: [PATCH 09/16] fix: address remaining review feedback on followup media - Scope file-block stripping to body region only, preserving file blocks in quoted/replied text and thread history above the body - Gate file-extraction skip on mutation evidence (Body differs from resolved original) instead of raw ' list[str]: ] prompts: list[str] = [] for _ in range(count): - prompts.append( - f"{random.choice(styles)} of {random.choice(subjects)}, {random.choice(lighting)}" - ) + prompts.append(f"{random.choice(styles)} of {random.choice(subjects)}, {random.choice(lighting)}") return prompts @@ -100,9 +98,7 @@ def normalize_optional_flag( value = aliases.get(value, value) if value not in allowed: - raise ValueError( - f"Invalid --{flag_name} '{raw_value}'. Allowed values: {allowed_text}." - ) + raise ValueError(f"Invalid --{flag_name} '{raw_value}'. Allowed values: {allowed_text}.") return value @@ -115,10 +111,7 @@ def normalize_background(model: str, background: str) -> str: supported=lambda candidate: candidate.startswith("gpt-image"), allowed={"transparent", "opaque", "auto"}, allowed_text="transparent, opaque, auto", - unsupported_message=( - "Warning: --background is only supported for gpt-image models; " - "ignoring for '{model}'." - ), + unsupported_message=("Warning: --background is only supported for gpt-image models; ignoring for '{model}'."), ) @@ -131,9 +124,7 @@ def normalize_style(model: str, style: str) -> str: supported=lambda candidate: candidate == "dall-e-3", allowed={"vivid", "natural"}, allowed_text="vivid, natural", - unsupported_message=( - "Warning: --style is only supported for dall-e-3; ignoring for '{model}'." - ), + unsupported_message=("Warning: --style is only supported for dall-e-3; ignoring for '{model}'."), ) @@ -147,8 +138,7 @@ def normalize_output_format(model: str, output_format: str) -> str: allowed={"png", "jpeg", "webp"}, allowed_text="png, jpeg, webp", unsupported_message=( - "Warning: --output-format is only supported for gpt-image models; " - "ignoring for '{model}'." + "Warning: --output-format is only supported for gpt-image models; ignoring for '{model}'." ), aliases={"jpg": "jpeg"}, ) @@ -245,9 +235,15 @@ def main() -> int: ap.add_argument("--prompt", help="Single prompt. If omitted, random prompts are generated.") ap.add_argument("--count", type=int, default=8, help="How many images to generate.") ap.add_argument("--model", default="gpt-image-1", help="Image model id.") - ap.add_argument("--size", default="", help="Image size (e.g. 1024x1024, 1536x1024). Defaults based on model if not specified.") - ap.add_argument("--quality", default="", help="Image quality (e.g. high, standard). Defaults based on model if not specified.") - ap.add_argument("--background", default="", help="Background transparency (GPT models only): transparent, opaque, or auto.") + ap.add_argument( + "--size", default="", help="Image size (e.g. 1024x1024, 1536x1024). Defaults based on model if not specified." + ) + ap.add_argument( + "--quality", default="", help="Image quality (e.g. high, standard). Defaults based on model if not specified." + ) + ap.add_argument( + "--background", default="", help="Background transparency (GPT models only): transparent, opaque, or auto." + ) ap.add_argument("--output-format", default="", help="Output format (GPT models only): png, jpeg, or webp.") ap.add_argument("--style", default="", help="Image style (dall-e-3 only): vivid or natural.") ap.add_argument("--out-dir", default="", help="Output directory (default: ./tmp/openai-image-gen-).") @@ -265,7 +261,10 @@ def main() -> int: count = args.count if args.model == "dall-e-3" and count > 1: - print(f"Warning: dall-e-3 only supports generating 1 image at a time. Reducing count from {count} to 1.", file=sys.stderr) + print( + f"Warning: dall-e-3 only supports generating 1 image at a time. Reducing count from {count} to 1.", + file=sys.stderr, + ) count = 1 out_dir = Path(args.out_dir).expanduser() if args.out_dir else default_out_dir() diff --git a/skills/openai-image-gen/scripts/test_gen.py b/skills/openai-image-gen/scripts/test_gen.py index 76445c0bb78..79d418c3e4c 100644 --- a/skills/openai-image-gen/scripts/test_gen.py +++ b/skills/openai-image-gen/scripts/test_gen.py @@ -82,6 +82,8 @@ def test_normalize_output_format_normalizes_case_for_supported_values(): def test_normalize_output_format_strips_whitespace_for_supported_values(): assert normalize_output_format("gpt-image-1", " png ") == "png" + + def test_normalize_output_format_keeps_supported_values(): assert normalize_output_format("gpt-image-1", "png") == "png" assert normalize_output_format("gpt-image-1", "jpeg") == "jpeg" diff --git a/skills/skill-creator/scripts/quick_validate.py b/skills/skill-creator/scripts/quick_validate.py index e8737b4f156..86897b373d7 100644 --- a/skills/skill-creator/scripts/quick_validate.py +++ b/skills/skill-creator/scripts/quick_validate.py @@ -6,7 +6,6 @@ Quick validation script for skills - minimal version import re import sys from pathlib import Path -from typing import Optional try: import yaml @@ -16,7 +15,7 @@ except ModuleNotFoundError: MAX_SKILL_NAME_LENGTH = 64 -def _extract_frontmatter(content: str) -> Optional[str]: +def _extract_frontmatter(content: str) -> str | None: lines = content.splitlines() if not lines or lines[0].strip() != "---": return None @@ -26,13 +25,13 @@ def _extract_frontmatter(content: str) -> Optional[str]: return None -def _parse_simple_frontmatter(frontmatter_text: str) -> Optional[dict[str, str]]: +def _parse_simple_frontmatter(frontmatter_text: str) -> dict[str, str] | None: """ Minimal fallback parser used when PyYAML is unavailable. Supports simple `key: value` mappings used by SKILL.md frontmatter. """ parsed: dict[str, str] = {} - current_key: Optional[str] = None + current_key: str | None = None for raw_line in frontmatter_text.splitlines(): stripped = raw_line.strip() if not stripped or stripped.startswith("#"): @@ -43,9 +42,7 @@ def _parse_simple_frontmatter(frontmatter_text: str) -> Optional[dict[str, str]] if current_key is None: return None current_value = parsed[current_key] - parsed[current_key] = ( - f"{current_value}\n{stripped}" if current_value else stripped - ) + parsed[current_key] = f"{current_value}\n{stripped}" if current_value else stripped continue if ":" not in stripped: @@ -55,9 +52,7 @@ def _parse_simple_frontmatter(frontmatter_text: str) -> Optional[dict[str, str]] value = value.strip() if not key: return None - if (value.startswith('"') and value.endswith('"')) or ( - value.startswith("'") and value.endswith("'") - ): + if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")): value = value[1:-1] parsed[key] = value current_key = key @@ -129,8 +124,7 @@ def validate_skill(skill_path): if len(name) > MAX_SKILL_NAME_LENGTH: return ( False, - f"Name is too long ({len(name)} characters). " - f"Maximum is {MAX_SKILL_NAME_LENGTH} characters.", + f"Name is too long ({len(name)} characters). Maximum is {MAX_SKILL_NAME_LENGTH} characters.", ) description = frontmatter.get("description", "") From a626c42fcd03744db962b12a9c3d9ee80f9990c7 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Sun, 15 Mar 2026 09:48:08 +0530 Subject: [PATCH 11/16] fix(android): theme popup surfaces --- CHANGELOG.md | 1 + .../java/ai/openclaw/app/ui/ConnectTabScreen.kt | 14 +++++++++++--- .../java/ai/openclaw/app/ui/OnboardingFlow.kt | 15 ++++++++++++--- .../java/ai/openclaw/app/ui/chat/ChatComposer.kt | 10 +++++++++- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7e204965d7..b1f88842755 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ Docs: https://docs.openclaw.ai - Configure/startup: move outbound send-deps resolution into a lightweight helper so `openclaw configure` no longer stalls after the banner while eagerly loading channel plugins. (#46301) thanks @scoootscooob. - Zalo Personal/group gating: stop reapplying `dmPolicy.allowFrom` as a sender gate for already-allowlisted groups when `groupAllowFrom` is unset, so any member of an allowed group can trigger replies while DMs stay restricted. (#40146) - Plugins/install precedence: keep bundled plugins ahead of auto-discovered globals by default, but let an explicitly installed plugin record win its own duplicate-id tie so installed channel plugins load from `~/.openclaw/extensions` after `openclaw plugins install`. +- Android/chat: theme the thinking dropdown and TLS trust dialogs explicitly so popup surfaces match the active app theme instead of falling back to mismatched Material defaults. ### Fixes diff --git a/apps/android/app/src/main/java/ai/openclaw/app/ui/ConnectTabScreen.kt b/apps/android/app/src/main/java/ai/openclaw/app/ui/ConnectTabScreen.kt index 0f5a2c7d08e..9ca0ad3f47f 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/ui/ConnectTabScreen.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/ui/ConnectTabScreen.kt @@ -92,20 +92,28 @@ fun ConnectTabScreen(viewModel: MainViewModel) { val prompt = pendingTrust!! AlertDialog( onDismissRequest = { viewModel.declineGatewayTrustPrompt() }, - title = { Text("Trust this gateway?") }, + containerColor = mobileCardSurface, + title = { Text("Trust this gateway?", style = mobileHeadline, color = mobileText) }, text = { Text( "First-time TLS connection.\n\nVerify this SHA-256 fingerprint before trusting:\n${prompt.fingerprintSha256}", style = mobileCallout, + color = mobileText, ) }, confirmButton = { - TextButton(onClick = { viewModel.acceptGatewayTrustPrompt() }) { + TextButton( + onClick = { viewModel.acceptGatewayTrustPrompt() }, + colors = ButtonDefaults.textButtonColors(contentColor = mobileAccent), + ) { Text("Trust and continue") } }, dismissButton = { - TextButton(onClick = { viewModel.declineGatewayTrustPrompt() }) { + TextButton( + onClick = { viewModel.declineGatewayTrustPrompt() }, + colors = ButtonDefaults.textButtonColors(contentColor = mobileTextSecondary), + ) { Text("Cancel") } }, diff --git a/apps/android/app/src/main/java/ai/openclaw/app/ui/OnboardingFlow.kt b/apps/android/app/src/main/java/ai/openclaw/app/ui/OnboardingFlow.kt index 8c4df5beed5..28487439c0b 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/ui/OnboardingFlow.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/ui/OnboardingFlow.kt @@ -455,19 +455,28 @@ fun OnboardingFlow(viewModel: MainViewModel, modifier: Modifier = Modifier) { val prompt = pendingTrust!! AlertDialog( onDismissRequest = { viewModel.declineGatewayTrustPrompt() }, - title = { Text("Trust this gateway?") }, + containerColor = onboardingSurface, + title = { Text("Trust this gateway?", style = onboardingHeadlineStyle, color = onboardingText) }, text = { Text( "First-time TLS connection.\n\nVerify this SHA-256 fingerprint before trusting:\n${prompt.fingerprintSha256}", + style = onboardingCalloutStyle, + color = onboardingText, ) }, confirmButton = { - TextButton(onClick = { viewModel.acceptGatewayTrustPrompt() }) { + TextButton( + onClick = { viewModel.acceptGatewayTrustPrompt() }, + colors = ButtonDefaults.textButtonColors(contentColor = onboardingAccent), + ) { Text("Trust and continue") } }, dismissButton = { - TextButton(onClick = { viewModel.declineGatewayTrustPrompt() }) { + TextButton( + onClick = { viewModel.declineGatewayTrustPrompt() }, + colors = ButtonDefaults.textButtonColors(contentColor = onboardingTextSecondary), + ) { Text("Cancel") } }, diff --git a/apps/android/app/src/main/java/ai/openclaw/app/ui/chat/ChatComposer.kt b/apps/android/app/src/main/java/ai/openclaw/app/ui/chat/ChatComposer.kt index f641486794b..1adcc34c2d6 100644 --- a/apps/android/app/src/main/java/ai/openclaw/app/ui/chat/ChatComposer.kt +++ b/apps/android/app/src/main/java/ai/openclaw/app/ui/chat/ChatComposer.kt @@ -128,7 +128,15 @@ fun ChatComposer( } } - DropdownMenu(expanded = showThinkingMenu, onDismissRequest = { showThinkingMenu = false }) { + DropdownMenu( + expanded = showThinkingMenu, + onDismissRequest = { showThinkingMenu = false }, + shape = RoundedCornerShape(16.dp), + containerColor = mobileCardSurface, + tonalElevation = 0.dp, + shadowElevation = 8.dp, + border = BorderStroke(1.dp, mobileBorder), + ) { ThinkingMenuItem("off", thinkingLevel, onSetThinkingLevel) { showThinkingMenu = false } ThinkingMenuItem("low", thinkingLevel, onSetThinkingLevel) { showThinkingMenu = false } ThinkingMenuItem("medium", thinkingLevel, onSetThinkingLevel) { showThinkingMenu = false } From 10c7872856fa04da925594b07959fa3240b88f24 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sun, 15 Mar 2026 00:40:12 -0400 Subject: [PATCH 12/16] Auto-reply: fix followup media prompt rebuild --- src/auto-reply/reply/followup-media.ts | 55 +++++-- src/auto-reply/reply/followup-runner.test.ts | 143 +++++++++++++++++++ 2 files changed, 183 insertions(+), 15 deletions(-) diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index 61d3b9f9cfb..d862a77875b 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -55,6 +55,30 @@ function replaceLastOccurrence( return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; } +function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): number { + if (!search) { + return -1; + } + const fileBlockIndex = value.search(FILE_BLOCK_RE); + const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value; + return bodyRegion.indexOf(search); +} + +function replaceFirstOccurrenceBeforeFileBlocks( + value: string, + search: string, + replacement: string, +): string | undefined { + if (!search) { + return undefined; + } + const index = findFirstOccurrenceBeforeFileBlocks(value, search); + if (index < 0) { + return undefined; + } + return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; +} + function stripInlineDirectives(text: string | undefined): string { return parseInlineDirectives(text ?? "").cleaned.trim(); } @@ -92,20 +116,29 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: { stripped = stripLeadingMediaReplyHint(stripped); } + const replacementTargets = [ + params.originalBody?.trim(), + stripInlineDirectives(params.originalBody), + MEDIA_ONLY_PLACEHOLDER, + ].filter( + (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index, + ); + // Strip pre-existing file blocks from the body region when the updated body // contains new file blocks. Mixed messages (audio + PDF) can arrive with // file extraction already applied in the primary path; without this strip // the old block stays in the prompt while the updated body adds a new one, // duplicating potentially large file payloads. - // Scope stripping to the body segment so quoted/replied text and thread - // history above the body retain any legitimate blocks. + // Scope stripping to the confirmed body segment so quoted/replied text, + // thread history above the body, and prompts whose original body no longer + // appears all retain any legitimate blocks. if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) { - const bodyTarget = params.originalBody?.trim(); - const bodyIdx = bodyTarget ? stripped.lastIndexOf(bodyTarget) : -1; + const bodyIdx = + replacementTargets + .map((target) => findFirstOccurrenceBeforeFileBlocks(stripped, target)) + .find((index) => index >= 0) ?? -1; if (bodyIdx >= 0) { stripped = stripped.slice(0, bodyIdx) + stripExistingFileBlocks(stripped.slice(bodyIdx)); - } else { - stripped = stripExistingFileBlocks(stripped); } } @@ -117,17 +150,9 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: { return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim(); } - const replacementTargets = [ - params.originalBody?.trim(), - stripInlineDirectives(params.originalBody), - MEDIA_ONLY_PLACEHOLDER, - ].filter( - (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index, - ); - let rebuilt = stripped; for (const target of replacementTargets) { - const replaced = replaceLastOccurrence(rebuilt, target, updatedBody); + const replaced = replaceFirstOccurrenceBeforeFileBlocks(rebuilt, target, updatedBody); if (replaced !== undefined) { rebuilt = replaced; return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index bae9fa98391..d06442fad5c 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1821,6 +1821,149 @@ describe("createFollowupRunner media understanding", () => { expect(agentCall?.prompt).not.toContain("old extracted content"); }); + it("preserves unrelated file blocks when the original body is absent from the prompt", async () => { + const quotedFileBlock = + '\nquoted thread attachment\n'; + const existingFileBlock = + '\nold extracted content\n'; + const newFileBlock = + '\nnew extracted content\n'; + const transcriptText = "Transcript from deferred audio"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nQuoted thread above\n\n${quotedFileBlock}`, + mediaContext: { + Body: `summarize this\n\n${existingFileBlock}`, + CommandBody: "summarize this", + RawBody: "summarize this", + MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"], + MediaTypes: ["audio/ogg", "application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("Quoted thread above"); + expect(agentCall?.prompt).toContain(quotedFileBlock); + expect(agentCall?.prompt).toContain(newFileBlock); + expect(agentCall?.prompt?.match(/ { + const existingFileBlock = + '\nsummary notes:\nsummarize this\n'; + const transcriptText = "Transcript from deferred audio"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${existingFileBlock}`, + mediaContext: { + Body: `summarize this\n\n${existingFileBlock}`, + CommandBody: "summarize this", + RawBody: "summarize this", + MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"], + MediaTypes: ["audio/ogg", "application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`; + expect(agentCall?.prompt).toContain(existingFileBlock); + expect(agentCall?.prompt).toContain(transcriptBlock); + expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeGreaterThan(-1); + expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeLessThan( + agentCall?.prompt?.indexOf(existingFileBlock) ?? -1, + ); + }); + it("sets DeferredMediaApplied when media understanding throws", async () => { applyMediaUnderstandingMock.mockRejectedValueOnce( new Error("transcription service unavailable"), From b853820eb229ce927acef66407e45c6e48e7565d Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sun, 15 Mar 2026 12:12:12 -0400 Subject: [PATCH 13/16] fix: address Codex and Greptile review comments on #46454 Replace regex-based file extraction detection (FILE_BLOCK_RE.test) with a DeferredFileBlocksExtracted mutation marker on FollowupMediaContext. The old approach scanned user body text for ' Date: Sun, 15 Mar 2026 15:51:14 -0400 Subject: [PATCH 14/16] fix: use file-block-safe replacement in normalizeUpdatedBody and trailing fallback (#46454) --- src/auto-reply/reply/followup-media.test.ts | 107 ++++++++++++++++++++ src/auto-reply/reply/followup-media.ts | 44 +++++++- 2 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 src/auto-reply/reply/followup-media.test.ts diff --git a/src/auto-reply/reply/followup-media.test.ts b/src/auto-reply/reply/followup-media.test.ts new file mode 100644 index 00000000000..d8ba453f801 --- /dev/null +++ b/src/auto-reply/reply/followup-media.test.ts @@ -0,0 +1,107 @@ +import { describe, expect, it } from "vitest"; +import { + _findLastOccurrenceBeforeFileBlocks as findLastOccurrenceBeforeFileBlocks, + _normalizeUpdatedBody as normalizeUpdatedBody, + _rebuildQueuedPromptWithMediaUnderstanding as rebuildQueuedPromptWithMediaUnderstanding, +} from "./followup-media.js"; + +const FILE_BLOCK = '\nPDF content\n'; + +describe("findLastOccurrenceBeforeFileBlocks", () => { + it("returns -1 for empty search", () => { + expect(findLastOccurrenceBeforeFileBlocks("hello", "")).toBe(-1); + }); + + it("finds last occurrence in body region before file blocks", () => { + const value = `hello world hello\n${FILE_BLOCK}`; + // "hello" appears at 0 and 12 — both before the file block + expect(findLastOccurrenceBeforeFileBlocks(value, "hello")).toBe(12); + }); + + it("does not match inside file block content", () => { + const value = `some text\n${FILE_BLOCK}\nPDF content`; + // "PDF content" appears in the file block and after it, but the body region + // (before { + // When the search string contains a block, it can't appear in the + // body-only region, so the fallback searches the full value. + const bodyWithFile = `caption\n${FILE_BLOCK}`; + const value = `previous\n${bodyWithFile}\nlater\n${bodyWithFile}`; + // Should find the *last* (trailing) occurrence + const expected = value.lastIndexOf(bodyWithFile); + expect(findLastOccurrenceBeforeFileBlocks(value, bodyWithFile)).toBe(expected); + expect(expected).toBeGreaterThan(value.indexOf(bodyWithFile)); + }); + + it("returns index when no file blocks exist in value", () => { + expect(findLastOccurrenceBeforeFileBlocks("abc abc", "abc")).toBe(4); + }); +}); + +describe("normalizeUpdatedBody", () => { + it("returns empty string when updatedBody is empty", () => { + expect(normalizeUpdatedBody({ originalBody: "foo", updatedBody: "" })).toBe(""); + }); + + it("returns updatedBody when originalBody is empty", () => { + expect(normalizeUpdatedBody({ updatedBody: "hello" })).toBe("hello"); + }); + + it("strips directives when updatedBody equals originalBody", () => { + const body = "/think high tell me a joke"; + const result = normalizeUpdatedBody({ originalBody: body, updatedBody: body }); + expect(result).toBe("tell me a joke"); + }); + + it("does not corrupt file block content during directive cleanup", () => { + const originalBody = "/think high tell me about this file"; + // updatedBody has the original body plus a file block appended by media processing + const updatedBody = `${originalBody}\n${FILE_BLOCK}`; + const result = normalizeUpdatedBody({ originalBody, updatedBody }); + // The directive should be stripped from the body portion, file block preserved + expect(result).toContain("tell me about this file"); + expect(result).toContain(FILE_BLOCK); + expect(result).not.toContain("/think"); + }); + + it("replaces in body region, not inside file blocks", () => { + const originalBody = "PDF content"; + const updatedBody = `PDF content\n\nPDF content\n`; + // The replacement should target the body region "PDF content" before the + // file block, not the "PDF content" inside the block. + const result = normalizeUpdatedBody({ originalBody, updatedBody }); + // With no directives to strip, original === cleaned, updatedBody !== originalBody + // because updatedBody has the file block appended. The replacement targets the + // body-region occurrence. + expect(result).toContain('"); + }); +}); + +describe("rebuildQueuedPromptWithMediaUnderstanding", () => { + it("replaces original body with updated body in prompt", () => { + const result = rebuildQueuedPromptWithMediaUnderstanding({ + prompt: "thread context\nhello world", + originalBody: "hello world", + updatedBody: 'hello world\ndata', + }); + expect(result).toContain('data'); + expect(result).toContain("thread context"); + }); + + it("preserves file blocks in thread history when body is replaced", () => { + const prompt = `history\nold\nhello world`; + const result = rebuildQueuedPromptWithMediaUnderstanding({ + prompt, + originalBody: "hello world", + updatedBody: "hello world transcribed", + }); + // The old file block from history should be preserved since updatedBody + // has no file blocks of its own. + expect(result).toContain('old'); + expect(result).toContain("hello world transcribed"); + }); +}); diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index d9202554526..f40ec91f109 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -64,6 +64,40 @@ function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): num return bodyRegion.indexOf(search); } +function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number { + if (!search) { + return -1; + } + const fileBlockIndex = value.search(FILE_BLOCK_RE); + const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value; + const index = bodyRegion.lastIndexOf(search); + if (index >= 0) { + return index; + } + // Fallback: search string itself contains file blocks — it can't appear in the + // body-only region. Search the full value with lastIndexOf to pick the trailing + // (most recent) occurrence when thread/history has identical bodies. + if (FILE_BLOCK_RE.test(search)) { + return value.lastIndexOf(search); + } + return -1; +} + +function replaceLastOccurrenceBeforeFileBlocks( + value: string, + search: string, + replacement: string, +): string | undefined { + if (!search) { + return undefined; + } + const index = findLastOccurrenceBeforeFileBlocks(value, search); + if (index < 0) { + return undefined; + } + return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; +} + function replaceFirstOccurrenceBeforeFileBlocks( value: string, search: string, @@ -101,7 +135,8 @@ function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: str return cleanedOriginalBody; } return ( - replaceLastOccurrence(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody + replaceLastOccurrenceBeforeFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ?? + updatedBody ).trim(); } @@ -215,6 +250,13 @@ function snapshotUpdatedMediaContext(params: { }; } +// Exported for unit testing — these are pure string helpers with no side effects. +export { + findLastOccurrenceBeforeFileBlocks as _findLastOccurrenceBeforeFileBlocks, + normalizeUpdatedBody as _normalizeUpdatedBody, + rebuildQueuedPromptWithMediaUnderstanding as _rebuildQueuedPromptWithMediaUnderstanding, +}; + export async function applyDeferredMediaUnderstandingToQueuedRun( queued: FollowupRun, params: { logLabel?: string } = {}, From 4092f82615ccdcad2a90c95ee3418f4abc3642b9 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sun, 15 Mar 2026 17:34:56 -0400 Subject: [PATCH 15/16] fix: trailing body match and RawBody-missing extraction detection (#46454) --- src/auto-reply/reply/followup-media.ts | 171 ++++++++++++------- src/auto-reply/reply/followup-runner.test.ts | 158 +++++++++++++++++ src/auto-reply/reply/queue/drain.ts | 9 +- 3 files changed, 272 insertions(+), 66 deletions(-) diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index f40ec91f109..425bdd601ea 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -1,3 +1,4 @@ +import path from "node:path"; import { logVerbose } from "../../globals.js"; import { applyMediaUnderstanding } from "../../media-understanding/apply.js"; import { @@ -40,30 +41,6 @@ function stripLeadingMediaReplyHint(prompt: string): string { return prompt.trim(); } -function replaceLastOccurrence( - value: string, - search: string, - replacement: string, -): string | undefined { - if (!search) { - return undefined; - } - const index = value.lastIndexOf(search); - if (index < 0) { - return undefined; - } - return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; -} - -function findFirstOccurrenceBeforeFileBlocks(value: string, search: string): number { - if (!search) { - return -1; - } - const fileBlockIndex = value.search(FILE_BLOCK_RE); - const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value; - return bodyRegion.indexOf(search); -} - function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number { if (!search) { return -1; @@ -98,21 +75,81 @@ function replaceLastOccurrenceBeforeFileBlocks( return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; } -function replaceFirstOccurrenceBeforeFileBlocks( +function findTrailingReplacementTargetBeforeFileBlocks( + value: string, + targets: string[], +): { index: number; target: string } | undefined { + let bestMatch: { index: number; target: string } | undefined; + for (const target of targets) { + const index = findLastOccurrenceBeforeFileBlocks(value, target); + if (index < 0) { + continue; + } + if (!bestMatch || index > bestMatch.index) { + bestMatch = { index, target }; + } + } + return bestMatch; +} + +function replaceOccurrenceAtIndex( value: string, search: string, replacement: string, -): string | undefined { - if (!search) { - return undefined; - } - const index = findFirstOccurrenceBeforeFileBlocks(value, search); - if (index < 0) { - return undefined; - } + index: number, +): string { return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`; } +function decodeXmlAttr(value: string): string { + return value + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/&/g, "&"); +} + +function extractAttachmentFileName(value?: string): string | undefined { + const trimmed = value?.trim(); + if (!trimmed) { + return undefined; + } + if (/^[a-zA-Z][a-zA-Z\d+.-]*:/.test(trimmed)) { + try { + const pathname = new URL(trimmed).pathname; + const basename = path.posix.basename(pathname); + return basename || undefined; + } catch { + // Fall back to path-style parsing below. + } + } + const normalized = trimmed.replace(/\\/g, "/"); + const basename = path.posix.basename(normalized); + return basename || undefined; +} + +function bodyContainsMatchingFileBlock(mediaContext: FollowupMediaContext): boolean { + const body = mediaContext.Body?.trim(); + if (!body || !FILE_BLOCK_RE.test(body)) { + return false; + } + const bodyFileNames = new Set(); + for (const match of body.matchAll(/]*>/gi)) { + const fileName = match[1]?.trim(); + if (fileName) { + bodyFileNames.add(decodeXmlAttr(fileName)); + } + } + if (bodyFileNames.size === 0) { + return false; + } + return normalizeAttachments(mediaContext as MsgContext).some((attachment) => { + const fileName = extractAttachmentFileName(attachment.path ?? attachment.url); + return Boolean(fileName && bodyFileNames.has(fileName)); + }); +} + function stripInlineDirectives(text: string | undefined): string { return parseInlineDirectives(text ?? "").cleaned.trim(); } @@ -168,12 +205,14 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: { // thread history above the body, and prompts whose original body no longer // appears all retain any legitimate blocks. if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) { - const bodyIdx = - replacementTargets - .map((target) => findFirstOccurrenceBeforeFileBlocks(stripped, target)) - .find((index) => index >= 0) ?? -1; - if (bodyIdx >= 0) { - stripped = stripped.slice(0, bodyIdx) + stripExistingFileBlocks(stripped.slice(bodyIdx)); + const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks( + stripped, + replacementTargets, + ); + if (trailingMatch) { + stripped = + stripped.slice(0, trailingMatch.index) + + stripExistingFileBlocks(stripped.slice(trailingMatch.index)); } } @@ -186,12 +225,15 @@ function rebuildQueuedPromptWithMediaUnderstanding(params: { } let rebuilt = stripped; - for (const target of replacementTargets) { - const replaced = replaceFirstOccurrenceBeforeFileBlocks(rebuilt, target, updatedBody); - if (replaced !== undefined) { - rebuilt = replaced; - return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); - } + const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks(rebuilt, replacementTargets); + if (trailingMatch) { + rebuilt = replaceOccurrenceAtIndex( + rebuilt, + trailingMatch.target, + updatedBody, + trailingMatch.index, + ); + return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim(); } rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n"); @@ -268,29 +310,29 @@ export async function applyDeferredMediaUnderstandingToQueuedRun( if (!mediaContext || mediaContext.DeferredMediaApplied) { return; } - if (mediaContext.MediaUnderstanding?.length) { - mediaContext.DeferredMediaApplied = true; - return; - } if (!hasMediaAttachments(mediaContext)) { mediaContext.DeferredMediaApplied = true; return; } - - const resolvedOriginalBody = - mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body; - // Detect file extraction from the primary path via body mutation instead of - // scanning for literal ' { }; expect(agentCall?.prompt?.match(/ { + const fileBlock = '\nreport content\n'; + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`, + mediaContext: { + Body: `summarize this\n\n${fileBlock}`, + CommandBody: "summarize this", + MediaPaths: ["/tmp/report.pdf"], + MediaTypes: ["application/pdf"], + }, + }), + ); + + expect(applyMediaUnderstandingMock).not.toHaveBeenCalled(); + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain(fileBlock); + }); + + it("replaces the trailing repeated body segment instead of the first matching thread text", async () => { + const existingFileBlock = + '\nold extracted content\n'; + const newFileBlock = + '\nnew extracted content\n'; + const transcriptText = "Deferred transcript"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: true, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + await runner( + createQueuedRun({ + prompt: + `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nThread history: summarize this\n\n` + + `summarize this\n\n${existingFileBlock}`, + mediaContext: { + Body: `summarize this\n\n${existingFileBlock}`, + CommandBody: "summarize this", + RawBody: "summarize this", + MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"], + MediaTypes: ["audio/ogg", "application/pdf"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + expect(agentCall?.prompt).toContain("Thread history: summarize this"); + expect(agentCall?.prompt).toContain(transcriptText); + expect(agentCall?.prompt).toContain(newFileBlock); + expect(agentCall?.prompt).not.toContain("old extracted content"); + expect(agentCall?.prompt?.indexOf(newFileBlock)).toBeGreaterThan( + agentCall?.prompt?.lastIndexOf("summarize this") ?? -1, + ); + }); }); describe("followup queue drain deferred media understanding", () => { @@ -2102,6 +2210,56 @@ describe("followup queue drain deferred media understanding", () => { expect(prompt).not.toContain("[media attached: /tmp/voice.ogg"); }); + it("preprocesses queued runs in parallel", async () => { + const resolvers: Array<() => void> = []; + const done = () => ({ + outputs: [], + decisions: [], + appliedImage: false, + appliedAudio: false, + appliedVideo: false, + appliedFile: false, + }); + applyMediaUnderstandingMock.mockImplementation( + async () => + await new Promise((resolve) => { + resolvers.push(() => resolve(done())); + }), + ); + + const items: FollowupRun[] = [ + createQueuedRun({ + prompt: "[media attached: /tmp/voice-1.ogg (audio/ogg)]\nfirst text", + summaryLine: "first text", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "first text", + MediaPaths: ["/tmp/voice-1.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + createQueuedRun({ + prompt: "[media attached: /tmp/voice-2.ogg (audio/ogg)]\nsecond text", + summaryLine: "second text", + run: { messageProvider: "telegram" }, + mediaContext: { + Body: "second text", + MediaPaths: ["/tmp/voice-2.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ]; + + const pending = applyDeferredMediaToQueuedRuns(items); + + expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(2); + + for (const resolve of resolvers) { + resolve(); + } + await pending; + }); + it("preprocesses dropped media items before building overflow summaries", async () => { applyMediaUnderstandingMock.mockImplementationOnce( async (params: { ctx: Record }) => { diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts index 471c94200ba..5d53df34189 100644 --- a/src/auto-reply/reply/queue/drain.ts +++ b/src/auto-reply/reply/queue/drain.ts @@ -76,9 +76,12 @@ function clearFollowupQueueSummaryState(queue: FollowupQueueState): void { } export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise { - for (const item of items) { - await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }); - } + await Promise.allSettled( + items.map( + async (item) => + await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }), + ), + ); } async function resolveSummaryLines(items: FollowupRun[]): Promise { From 42b1271c85489942819476245c767c3033fdce25 Mon Sep 17 00:00:00 2001 From: Joey Krug Date: Sun, 15 Mar 2026 18:00:45 -0400 Subject: [PATCH 16/16] fix: parallelize deferred media calls and search full prompt outside file blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parallelize deferred media understanding calls in resolveSummaryLines and applyDeferredMediaToQueuedRuns using Promise.allSettled so media API calls run concurrently while summary line order stays sequential. Replace findFirstOccurrenceBeforeFileBlocks (which truncated at the first tag) with findLastOccurrenceOutsideFileBlocks that searches the full prompt via lastIndexOf and skips matches inside blocks. This fixes body replacement when thread/history context has extracted file blocks before the current queued message body. Add regression test for body appearing after thread-history file blocks. --- src/auto-reply/reply/followup-media.test.ts | 40 +++++++---- src/auto-reply/reply/followup-media.ts | 58 +++++++++++----- src/auto-reply/reply/followup-runner.test.ts | 72 ++++++++++++++++++++ src/auto-reply/reply/queue/drain.ts | 21 +++--- 4 files changed, 152 insertions(+), 39 deletions(-) diff --git a/src/auto-reply/reply/followup-media.test.ts b/src/auto-reply/reply/followup-media.test.ts index d8ba453f801..77996f85606 100644 --- a/src/auto-reply/reply/followup-media.test.ts +++ b/src/auto-reply/reply/followup-media.test.ts @@ -1,43 +1,57 @@ import { describe, expect, it } from "vitest"; import { - _findLastOccurrenceBeforeFileBlocks as findLastOccurrenceBeforeFileBlocks, + _findLastOccurrenceOutsideFileBlocks as findLastOccurrenceOutsideFileBlocks, _normalizeUpdatedBody as normalizeUpdatedBody, _rebuildQueuedPromptWithMediaUnderstanding as rebuildQueuedPromptWithMediaUnderstanding, } from "./followup-media.js"; const FILE_BLOCK = '\nPDF content\n'; -describe("findLastOccurrenceBeforeFileBlocks", () => { +describe("findLastOccurrenceOutsideFileBlocks", () => { it("returns -1 for empty search", () => { - expect(findLastOccurrenceBeforeFileBlocks("hello", "")).toBe(-1); + expect(findLastOccurrenceOutsideFileBlocks("hello", "")).toBe(-1); }); it("finds last occurrence in body region before file blocks", () => { const value = `hello world hello\n${FILE_BLOCK}`; // "hello" appears at 0 and 12 — both before the file block - expect(findLastOccurrenceBeforeFileBlocks(value, "hello")).toBe(12); + expect(findLastOccurrenceOutsideFileBlocks(value, "hello")).toBe(12); }); - it("does not match inside file block content", () => { + it("skips matches inside file block content", () => { + // "PDF content" appears only inside the file block — no valid match outside. + const value = `some text\n${FILE_BLOCK}`; + expect(findLastOccurrenceOutsideFileBlocks(value, "PDF content")).toBe(-1); + }); + + it("finds trailing occurrence outside file block even when also inside one", () => { const value = `some text\n${FILE_BLOCK}\nPDF content`; - // "PDF content" appears in the file block and after it, but the body region - // (before { - // When the search string contains a block, it can't appear in the - // body-only region, so the fallback searches the full value. + it("finds occurrence when search itself contains file blocks", () => { const bodyWithFile = `caption\n${FILE_BLOCK}`; const value = `previous\n${bodyWithFile}\nlater\n${bodyWithFile}`; // Should find the *last* (trailing) occurrence const expected = value.lastIndexOf(bodyWithFile); - expect(findLastOccurrenceBeforeFileBlocks(value, bodyWithFile)).toBe(expected); + expect(findLastOccurrenceOutsideFileBlocks(value, bodyWithFile)).toBe(expected); expect(expected).toBeGreaterThan(value.indexOf(bodyWithFile)); }); it("returns index when no file blocks exist in value", () => { - expect(findLastOccurrenceBeforeFileBlocks("abc abc", "abc")).toBe(4); + expect(findLastOccurrenceOutsideFileBlocks("abc abc", "abc")).toBe(4); + }); + + it("finds body text after thread-history file blocks", () => { + const value = `Thread history\n${FILE_BLOCK}\n\ncheck this out`; + // The body "check this out" appears after a file block from thread history. + // The old truncation approach would miss this; the new approach finds it. + expect(findLastOccurrenceOutsideFileBlocks(value, "check this out")).toBe( + value.lastIndexOf("check this out"), + ); }); }); diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts index 425bdd601ea..f0d5d951683 100644 --- a/src/auto-reply/reply/followup-media.ts +++ b/src/auto-reply/reply/followup-media.ts @@ -41,26 +41,48 @@ function stripLeadingMediaReplyHint(prompt: string): string { return prompt.trim(); } -function findLastOccurrenceBeforeFileBlocks(value: string, search: string): number { +/** Collect the [start, end) ranges of every `` block in `value`. */ +function collectFileBlockRanges(value: string): Array<[number, number]> { + const ranges: Array<[number, number]> = []; + const re = new RegExp(FILE_BLOCK_FULL_RE.source, FILE_BLOCK_FULL_RE.flags); + let m: RegExpExecArray | null; + while ((m = re.exec(value)) !== null) { + ranges.push([m.index, m.index + m[0].length]); + } + return ranges; +} + +function isInsideFileBlock( + position: number, + length: number, + ranges: Array<[number, number]>, +): boolean { + for (const [start, end] of ranges) { + if (position >= start && position + length <= end) { + return true; + } + } + return false; +} + +/** + * Find the last occurrence of `search` in `value` that is NOT inside a + * `` block. Searches the full string with lastIndexOf, + * then walks backward past any matches that fall inside file blocks. + */ +function findLastOccurrenceOutsideFileBlocks(value: string, search: string): number { if (!search) { return -1; } - const fileBlockIndex = value.search(FILE_BLOCK_RE); - const bodyRegion = fileBlockIndex >= 0 ? value.slice(0, fileBlockIndex) : value; - const index = bodyRegion.lastIndexOf(search); - if (index >= 0) { - return index; + const ranges = collectFileBlockRanges(value); + let pos = value.lastIndexOf(search); + while (pos >= 0 && isInsideFileBlock(pos, search.length, ranges)) { + pos = value.lastIndexOf(search, pos - 1); } - // Fallback: search string itself contains file blocks — it can't appear in the - // body-only region. Search the full value with lastIndexOf to pick the trailing - // (most recent) occurrence when thread/history has identical bodies. - if (FILE_BLOCK_RE.test(search)) { - return value.lastIndexOf(search); - } - return -1; + return pos; } -function replaceLastOccurrenceBeforeFileBlocks( +function replaceLastOccurrenceOutsideFileBlocks( value: string, search: string, replacement: string, @@ -68,7 +90,7 @@ function replaceLastOccurrenceBeforeFileBlocks( if (!search) { return undefined; } - const index = findLastOccurrenceBeforeFileBlocks(value, search); + const index = findLastOccurrenceOutsideFileBlocks(value, search); if (index < 0) { return undefined; } @@ -81,7 +103,7 @@ function findTrailingReplacementTargetBeforeFileBlocks( ): { index: number; target: string } | undefined { let bestMatch: { index: number; target: string } | undefined; for (const target of targets) { - const index = findLastOccurrenceBeforeFileBlocks(value, target); + const index = findLastOccurrenceOutsideFileBlocks(value, target); if (index < 0) { continue; } @@ -172,7 +194,7 @@ function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: str return cleanedOriginalBody; } return ( - replaceLastOccurrenceBeforeFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ?? + replaceLastOccurrenceOutsideFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ?? updatedBody ).trim(); } @@ -294,7 +316,7 @@ function snapshotUpdatedMediaContext(params: { // Exported for unit testing — these are pure string helpers with no side effects. export { - findLastOccurrenceBeforeFileBlocks as _findLastOccurrenceBeforeFileBlocks, + findLastOccurrenceOutsideFileBlocks as _findLastOccurrenceOutsideFileBlocks, normalizeUpdatedBody as _normalizeUpdatedBody, rebuildQueuedPromptWithMediaUnderstanding as _rebuildQueuedPromptWithMediaUnderstanding, }; diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts index 045d8ad0658..ed076b4da59 100644 --- a/src/auto-reply/reply/followup-runner.test.ts +++ b/src/auto-reply/reply/followup-runner.test.ts @@ -1964,6 +1964,78 @@ describe("createFollowupRunner media understanding", () => { ); }); + it("finds the body after thread-history file blocks when body appears after the first tag", async () => { + const threadFileBlock = + '\nolder thread attachment\n'; + const transcriptText = "Transcript from deferred voice note"; + + applyMediaUnderstandingMock.mockImplementationOnce( + async (params: { ctx: Record }) => { + params.ctx.MediaUnderstanding = [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ]; + params.ctx.Transcript = transcriptText; + params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`; + return { + outputs: [ + { + kind: "audio.transcription", + text: transcriptText, + attachmentIndex: 0, + provider: "whisper", + }, + ], + decisions: [], + appliedImage: false, + appliedAudio: true, + appliedVideo: false, + appliedFile: false, + }; + }, + ); + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "processed" }], + meta: {}, + }); + + const runner = createFollowupRunner({ + opts: { onBlockReply: vi.fn(async () => {}) }, + typing: createMockTypingController(), + typingMode: "instant", + defaultModel: "anthropic/claude-opus-4-5", + }); + + // The prompt has thread history with a file block BEFORE the current + // queued body text. The old truncation approach would miss the body + // because it only searched before the first tag. + await runner( + createQueuedRun({ + prompt: `[media attached: /tmp/voice.ogg]\n${MEDIA_REPLY_HINT}\nThread history\n\n${threadFileBlock}\n\ncheck this out`, + mediaContext: { + Body: "check this out", + RawBody: "check this out", + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }, + }), + ); + + const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { + prompt?: string; + }; + const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`; + // The body should be replaced with the transcript block + expect(agentCall?.prompt).toContain(transcriptBlock); + // Thread history and its file block should be preserved + expect(agentCall?.prompt).toContain("Thread history"); + expect(agentCall?.prompt).toContain(threadFileBlock); + }); + it("sets DeferredMediaApplied when media understanding throws", async () => { applyMediaUnderstandingMock.mockRejectedValueOnce( new Error("transcription service unavailable"), diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts index 5d53df34189..41c859d2ec3 100644 --- a/src/auto-reply/reply/queue/drain.ts +++ b/src/auto-reply/reply/queue/drain.ts @@ -85,14 +85,19 @@ export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Prom } async function resolveSummaryLines(items: FollowupRun[]): Promise { - const summaryLines: string[] = []; - for (const item of items) { - await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }); - // After deferred media, prefer the updated prompt (which includes transcripts) - // over the original summaryLine (which may just be the caption text). - summaryLines.push(buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || "")); - } - return summaryLines; + // Parallelize the media understanding API calls upfront (same pattern as + // applyDeferredMediaToQueuedRuns), then build summary lines sequentially + // so line order matches the original item order. + await Promise.allSettled( + items.map((item) => + applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }), + ), + ); + // After deferred media, prefer the updated prompt (which includes transcripts) + // over the original summaryLine (which may just be the caption text). + return items.map((item) => + buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || ""), + ); } export async function buildMediaAwareQueueSummaryPrompt(params: {