diff --git a/scripts/check-composite-action-input-interpolation.py b/scripts/check-composite-action-input-interpolation.py
index 18c52501b85..5c852a7f323 100644
--- a/scripts/check-composite-action-input-interpolation.py
+++ b/scripts/check-composite-action-input-interpolation.py
@@ -5,7 +5,6 @@ import pathlib
 import re
 import sys
 
-
 INPUT_INTERPOLATION_RE = re.compile(r"\$\{\{\s*inputs\.")
 RUN_LINE_RE = re.compile(r"^(\s*)run:\s*(.*)$")
 USING_COMPOSITE_RE = re.compile(r"^\s*using:\s*composite\s*$", re.MULTILINE)
diff --git a/skills/model-usage/scripts/model_usage.py b/skills/model-usage/scripts/model_usage.py
index ea28fa0d983..7688e4e46b2 100644
--- a/skills/model-usage/scripts/model_usage.py
+++ b/skills/model-usage/scripts/model_usage.py
@@ -9,12 +9,12 @@ from __future__ import annotations
 
 import argparse
 import json
-import os
 import subprocess
 import sys
+from collections.abc import Iterable
 from dataclasses import dataclass
 from datetime import date, datetime, timedelta
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any
 
 
 def positive_int(value: str) -> int:
@@ -31,7 +31,7 @@ def eprint(msg: str) -> None:
     print(msg, file=sys.stderr)
 
 
-def run_codexbar_cost(provider: str) -> List[Dict[str, Any]]:
+def run_codexbar_cost(provider: str) -> list[dict[str, Any]]:
     cmd = ["codexbar", "cost", "--format", "json", "--provider", provider]
     try:
         output = subprocess.check_output(cmd, text=True)
@@ -48,12 +48,12 @@ def run_codexbar_cost(provider: str) -> List[Dict[str, Any]]:
     return payload
 
 
-def load_payload(input_path: Optional[str], provider: str) -> Dict[str, Any]:
+def load_payload(input_path: str | None, provider: str) -> dict[str, Any]:
     if input_path:
         if input_path == "-":
             raw = sys.stdin.read()
         else:
-            with open(input_path, "r", encoding="utf-8") as handle:
+            with open(input_path, encoding="utf-8") as handle:
                 raw = handle.read()
         data = json.loads(raw)
     else:
@@ -77,7 +77,7 @@ class ModelCost:
     cost: float
 
 
-def parse_daily_entries(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
+def parse_daily_entries(payload: dict[str, Any]) -> list[dict[str, Any]]:
     daily = payload.get("daily")
     if not daily:
         return []
@@ -86,18 +86,18 @@ def parse_daily_entries(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
     return [entry for entry in daily if isinstance(entry, dict)]
 
 
-def parse_date(value: str) -> Optional[date]:
+def parse_date(value: str) -> date | None:
     try:
         return datetime.strptime(value, "%Y-%m-%d").date()
     except Exception:
         return None
 
 
-def filter_by_days(entries: List[Dict[str, Any]], days: Optional[int]) -> List[Dict[str, Any]]:
+def filter_by_days(entries: list[dict[str, Any]], days: int | None) -> list[dict[str, Any]]:
     if not days:
         return entries
     cutoff = date.today() - timedelta(days=days - 1)
-    filtered: List[Dict[str, Any]] = []
+    filtered: list[dict[str, Any]] = []
     for entry in entries:
         day = entry.get("date")
         if not isinstance(day, str):
@@ -108,8 +108,8 @@ def filter_by_days(entries: List[Dict[str, Any]], days: Optional[int]) -> List[D
     return filtered
 
 
-def aggregate_costs(entries: Iterable[Dict[str, Any]]) -> Dict[str, float]:
-    totals: Dict[str, float] = {}
+def aggregate_costs(entries: Iterable[dict[str, Any]]) -> dict[str, float]:
+    totals: dict[str, float] = {}
     for entry in entries:
         breakdowns = entry.get("modelBreakdowns")
         if not breakdowns:
@@ -129,7 +129,7 @@ def aggregate_costs(entries: Iterable[Dict[str, Any]]) -> Dict[str, float]:
     return totals
 
 
-def pick_current_model(entries: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
+def pick_current_model(entries: list[dict[str, Any]]) -> tuple[str | None, str | None]:
     if not entries:
         return None, None
     sorted_entries = sorted(
@@ -139,7 +139,7 @@ def pick_current_model(entries: List[Dict[str, Any]]) -> Tuple[Optional[str], Op
     for entry in reversed(sorted_entries):
         breakdowns = entry.get("modelBreakdowns")
         if isinstance(breakdowns, list) and breakdowns:
-            scored: List[ModelCost] = []
+            scored: list[ModelCost] = []
             for item in breakdowns:
                 if not isinstance(item, dict):
                     continue
@@ -158,13 +158,13 @@ def pick_current_model(entries: List[Dict[str, Any]]) -> Tuple[Optional[str], Op
     return None, None
 
 
-def usd(value: Optional[float]) -> str:
+def usd(value: float | None) -> str:
     if value is None:
         return "—"
     return f"${value:,.2f}"
 
 
-def latest_day_cost(entries: List[Dict[str, Any]], model: str) -> Tuple[Optional[str], Optional[float]]:
+def latest_day_cost(entries: list[dict[str, Any]], model: str) -> tuple[str | None, float | None]:
     if not entries:
         return None, None
     sorted_entries = sorted(
@@ -188,10 +188,10 @@ def latest_day_cost(entries: List[Dict[str, Any]], model: str) -> Tuple[Optional
 def render_text_current(
     provider: str,
     model: str,
-    latest_date: Optional[str],
-    total_cost: Optional[float],
-    latest_cost: Optional[float],
-    latest_cost_date: Optional[str],
+    latest_date: str | None,
+    total_cost: float | None,
+    latest_cost: float | None,
+    latest_cost_date: str | None,
     entry_count: int,
 ) -> str:
     lines = [f"Provider: {provider}", f"Current model: {model}"]
@@ -204,7 +204,7 @@ def render_text_current(
     return "\n".join(lines)
 
 
-def render_text_all(provider: str, totals: Dict[str, float]) -> str:
+def render_text_all(provider: str, totals: dict[str, float]) -> str:
     lines = [f"Provider: {provider}", "Models:"]
     for model, cost in sorted(totals.items(), key=lambda item: item[1], reverse=True):
         lines.append(f"- {model}: {usd(cost)}")
@@ -214,12 +214,12 @@ def render_text_all(provider: str, totals: Dict[str, float]) -> str:
 def build_json_current(
     provider: str,
     model: str,
-    latest_date: Optional[str],
-    total_cost: Optional[float],
-    latest_cost: Optional[float],
-    latest_cost_date: Optional[str],
+    latest_date: str | None,
+    total_cost: float | None,
+    latest_cost: float | None,
+    latest_cost_date: str | None,
     entry_count: int,
-) -> Dict[str, Any]:
+) -> dict[str, Any]:
     return {
         "provider": provider,
         "mode": "current",
@@ -232,7 +232,7 @@ def build_json_current(
     }
 
 
-def build_json_all(provider: str, totals: Dict[str, float]) -> Dict[str, Any]:
+def build_json_all(provider: str, totals: dict[str, float]) -> dict[str, Any]:
     return {
         "provider": provider,
         "mode": "all",
diff --git a/skills/nano-banana-pro/scripts/generate_image.py b/skills/nano-banana-pro/scripts/generate_image.py
index 796022adfba..92efabc2b0f 100755
--- a/skills/nano-banana-pro/scripts/generate_image.py
+++ b/skills/nano-banana-pro/scripts/generate_image.py
@@ -70,42 +70,32 @@ def choose_output_resolution(
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="Generate images using Nano Banana Pro (Gemini 3 Pro Image)"
-    )
+    parser = argparse.ArgumentParser(description="Generate images using Nano Banana Pro (Gemini 3 Pro Image)")
+    parser.add_argument("--prompt", "-p", required=True, help="Image description/prompt")
+    parser.add_argument("--filename", "-f", required=True, help="Output filename (e.g., sunset-mountains.png)")
     parser.add_argument(
-        "--prompt", "-p",
-        required=True,
-        help="Image description/prompt"
-    )
-    parser.add_argument(
-        "--filename", "-f",
-        required=True,
-        help="Output filename (e.g., sunset-mountains.png)"
-    )
-    parser.add_argument(
-        "--input-image", "-i",
+        "--input-image",
+        "-i",
         action="append",
         dest="input_images",
         metavar="IMAGE",
-        help="Input image path(s) for editing/composition. Can be specified multiple times (up to 14 images)."
+        help="Input image path(s) for editing/composition. Can be specified multiple times (up to 14 images).",
     )
     parser.add_argument(
-        "--resolution", "-r",
+        "--resolution",
+        "-r",
         choices=["1K", "2K", "4K"],
         default=None,
-        help="Output resolution: 1K, 2K, or 4K. If omitted with input images, auto-detect from largest image dimension."
+        help="Output resolution: 1K, 2K, or 4K. If omitted with input images, auto-detect from largest image dimension.",
     )
     parser.add_argument(
-        "--aspect-ratio", "-a",
+        "--aspect-ratio",
+        "-a",
         choices=SUPPORTED_ASPECT_RATIOS,
         default=None,
-        help=f"Output aspect ratio (default: model decides). Options: {', '.join(SUPPORTED_ASPECT_RATIOS)}"
-    )
-    parser.add_argument(
-        "--api-key", "-k",
-        help="Gemini API key (overrides GEMINI_API_KEY env var)"
+        help=f"Output aspect ratio (default: model decides). Options: {', '.join(SUPPORTED_ASPECT_RATIOS)}",
     )
+    parser.add_argument("--api-key", "-k", help="Gemini API key (overrides GEMINI_API_KEY env var)")
 
     args = parser.parse_args()
 
@@ -158,10 +148,7 @@ def main():
         has_input_images=bool(input_images),
     )
     if auto_detected:
-        print(
-            f"Auto-detected resolution: {output_resolution} "
-            f"(from max input dimension {max_input_dim})"
-        )
+        print(f"Auto-detected resolution: {output_resolution} (from max input dimension {max_input_dim})")
 
     # Build contents (images first if editing, prompt only if generating)
     if input_images:
@@ -182,9 +169,8 @@ def main():
             model="gemini-3-pro-image-preview",
             contents=contents,
             config=types.GenerateContentConfig(
-                response_modalities=["TEXT", "IMAGE"],
-                image_config=types.ImageConfig(**image_cfg_kwargs)
-            )
+                response_modalities=["TEXT", "IMAGE"], image_config=types.ImageConfig(**image_cfg_kwargs)
+            ),
         )
 
         # Process response and convert to PNG
@@ -201,19 +187,20 @@ def main():
                 if isinstance(image_data, str):
                     # If it's a string, it might be base64
                     import base64
+
                     image_data = base64.b64decode(image_data)
 
                 image = PILImage.open(BytesIO(image_data))
 
                 # Ensure RGB mode for PNG (convert RGBA to RGB with white background if needed)
-                if image.mode == 'RGBA':
-                    rgb_image = PILImage.new('RGB', image.size, (255, 255, 255))
+                if image.mode == "RGBA":
+                    rgb_image = PILImage.new("RGB", image.size, (255, 255, 255))
                     rgb_image.paste(image, mask=image.split()[3])
-                    rgb_image.save(str(output_path), 'PNG')
-                elif image.mode == 'RGB':
-                    image.save(str(output_path), 'PNG')
+                    rgb_image.save(str(output_path), "PNG")
+                elif image.mode == "RGB":
+                    image.save(str(output_path), "PNG")
                 else:
-                    image.convert('RGB').save(str(output_path), 'PNG')
+                    image.convert("RGB").save(str(output_path), "PNG")
                 image_saved = True
 
         if image_saved:
diff --git a/skills/openai-image-gen/scripts/gen.py b/skills/openai-image-gen/scripts/gen.py
index 2d8c7569016..1fd61f0f7a3 100644
--- a/skills/openai-image-gen/scripts/gen.py
+++ b/skills/openai-image-gen/scripts/gen.py
@@ -58,9 +58,7 @@ def pick_prompts(count: int) -> list[str]:
     ]
     prompts: list[str] = []
     for _ in range(count):
-        prompts.append(
-            f"{random.choice(styles)} of {random.choice(subjects)}, {random.choice(lighting)}"
-        )
+        prompts.append(f"{random.choice(styles)} of {random.choice(subjects)}, {random.choice(lighting)}")
     return prompts
 
 
@@ -100,9 +98,7 @@ def normalize_optional_flag(
         value = aliases.get(value, value)
 
     if value not in allowed:
-        raise ValueError(
-            f"Invalid --{flag_name} '{raw_value}'. Allowed values: {allowed_text}."
-        )
+        raise ValueError(f"Invalid --{flag_name} '{raw_value}'. Allowed values: {allowed_text}.")
     return value
 
 
@@ -115,10 +111,7 @@ def normalize_background(model: str, background: str) -> str:
         supported=lambda candidate: candidate.startswith("gpt-image"),
         allowed={"transparent", "opaque", "auto"},
         allowed_text="transparent, opaque, auto",
-        unsupported_message=(
-            "Warning: --background is only supported for gpt-image models; "
-            "ignoring for '{model}'."
-        ),
+        unsupported_message=("Warning: --background is only supported for gpt-image models; ignoring for '{model}'."),
     )
 
 
@@ -131,9 +124,7 @@ def normalize_style(model: str, style: str) -> str:
         supported=lambda candidate: candidate == "dall-e-3",
         allowed={"vivid", "natural"},
         allowed_text="vivid, natural",
-        unsupported_message=(
-            "Warning: --style is only supported for dall-e-3; ignoring for '{model}'."
-        ),
+        unsupported_message=("Warning: --style is only supported for dall-e-3; ignoring for '{model}'."),
     )
 
 
@@ -147,8 +138,7 @@ def normalize_output_format(model: str, output_format: str) -> str:
         allowed={"png", "jpeg", "webp"},
         allowed_text="png, jpeg, webp",
         unsupported_message=(
-            "Warning: --output-format is only supported for gpt-image models; "
-            "ignoring for '{model}'."
+            "Warning: --output-format is only supported for gpt-image models; ignoring for '{model}'."
         ),
         aliases={"jpg": "jpeg"},
     )
@@ -245,9 +235,15 @@ def main() -> int:
     ap.add_argument("--prompt", help="Single prompt. If omitted, random prompts are generated.")
     ap.add_argument("--count", type=int, default=8, help="How many images to generate.")
     ap.add_argument("--model", default="gpt-image-1", help="Image model id.")
-    ap.add_argument("--size", default="", help="Image size (e.g. 1024x1024, 1536x1024). Defaults based on model if not specified.")
-    ap.add_argument("--quality", default="", help="Image quality (e.g. high, standard). Defaults based on model if not specified.")
-    ap.add_argument("--background", default="", help="Background transparency (GPT models only): transparent, opaque, or auto.")
+    ap.add_argument(
+        "--size", default="", help="Image size (e.g. 1024x1024, 1536x1024). Defaults based on model if not specified."
+    )
+    ap.add_argument(
+        "--quality", default="", help="Image quality (e.g. high, standard). Defaults based on model if not specified."
+    )
+    ap.add_argument(
+        "--background", default="", help="Background transparency (GPT models only): transparent, opaque, or auto."
+    )
     ap.add_argument("--output-format", default="", help="Output format (GPT models only): png, jpeg, or webp.")
     ap.add_argument("--style", default="", help="Image style (dall-e-3 only): vivid or natural.")
     ap.add_argument("--out-dir", default="", help="Output directory (default: ./tmp/openai-image-gen-<ts>).")
@@ -265,7 +261,10 @@ def main() -> int:
 
     count = args.count
     if args.model == "dall-e-3" and count > 1:
-        print(f"Warning: dall-e-3 only supports generating 1 image at a time. Reducing count from {count} to 1.", file=sys.stderr)
+        print(
+            f"Warning: dall-e-3 only supports generating 1 image at a time. Reducing count from {count} to 1.",
+            file=sys.stderr,
+        )
         count = 1
 
     out_dir = Path(args.out_dir).expanduser() if args.out_dir else default_out_dir()
diff --git a/skills/openai-image-gen/scripts/test_gen.py b/skills/openai-image-gen/scripts/test_gen.py
index 76445c0bb78..79d418c3e4c 100644
--- a/skills/openai-image-gen/scripts/test_gen.py
+++ b/skills/openai-image-gen/scripts/test_gen.py
@@ -82,6 +82,8 @@ def test_normalize_output_format_normalizes_case_for_supported_values():
 
 def test_normalize_output_format_strips_whitespace_for_supported_values():
     assert normalize_output_format("gpt-image-1", " png ") == "png"
+
+
 def test_normalize_output_format_keeps_supported_values():
     assert normalize_output_format("gpt-image-1", "png") == "png"
     assert normalize_output_format("gpt-image-1", "jpeg") == "jpeg"
diff --git a/skills/skill-creator/scripts/quick_validate.py b/skills/skill-creator/scripts/quick_validate.py
index e8737b4f156..86897b373d7 100644
--- a/skills/skill-creator/scripts/quick_validate.py
+++ b/skills/skill-creator/scripts/quick_validate.py
@@ -6,7 +6,6 @@ Quick validation script for skills - minimal version
 import re
 import sys
 from pathlib import Path
-from typing import Optional
 
 try:
     import yaml
@@ -16,7 +15,7 @@ except ModuleNotFoundError:
 MAX_SKILL_NAME_LENGTH = 64
 
 
-def _extract_frontmatter(content: str) -> Optional[str]:
+def _extract_frontmatter(content: str) -> str | None:
     lines = content.splitlines()
     if not lines or lines[0].strip() != "---":
         return None
@@ -26,13 +25,13 @@ def _extract_frontmatter(content: str) -> Optional[str]:
     return None
 
 
-def _parse_simple_frontmatter(frontmatter_text: str) -> Optional[dict[str, str]]:
+def _parse_simple_frontmatter(frontmatter_text: str) -> dict[str, str] | None:
     """
     Minimal fallback parser used when PyYAML is unavailable.
     Supports simple `key: value` mappings used by SKILL.md frontmatter.
     """
     parsed: dict[str, str] = {}
-    current_key: Optional[str] = None
+    current_key: str | None = None
     for raw_line in frontmatter_text.splitlines():
         stripped = raw_line.strip()
         if not stripped or stripped.startswith("#"):
@@ -43,9 +42,7 @@ def _parse_simple_frontmatter(frontmatter_text: str) -> Optional[dict[str, str]]
             if current_key is None:
                 return None
             current_value = parsed[current_key]
-            parsed[current_key] = (
-                f"{current_value}\n{stripped}" if current_value else stripped
-            )
+            parsed[current_key] = f"{current_value}\n{stripped}" if current_value else stripped
             continue
 
         if ":" not in stripped:
@@ -55,9 +52,7 @@ def _parse_simple_frontmatter(frontmatter_text: str) -> Optional[dict[str, str]]
         value = value.strip()
         if not key:
             return None
-        if (value.startswith('"') and value.endswith('"')) or (
-            value.startswith("'") and value.endswith("'")
-        ):
+        if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
             value = value[1:-1]
         parsed[key] = value
         current_key = key
@@ -129,8 +124,7 @@ def validate_skill(skill_path):
         if len(name) > MAX_SKILL_NAME_LENGTH:
             return (
                 False,
-                f"Name is too long ({len(name)} characters). "
-                f"Maximum is {MAX_SKILL_NAME_LENGTH} characters.",
+                f"Name is too long ({len(name)} characters). Maximum is {MAX_SKILL_NAME_LENGTH} characters.",
             )
 
     description = frontmatter.get("description", "")
diff --git a/src/auto-reply/reply/followup-media.test.ts b/src/auto-reply/reply/followup-media.test.ts
new file mode 100644
index 00000000000..77996f85606
--- /dev/null
+++ b/src/auto-reply/reply/followup-media.test.ts
@@ -0,0 +1,121 @@
+import { describe, expect, it } from "vitest";
+import {
+  _findLastOccurrenceOutsideFileBlocks as findLastOccurrenceOutsideFileBlocks,
+  _normalizeUpdatedBody as normalizeUpdatedBody,
+  _rebuildQueuedPromptWithMediaUnderstanding as rebuildQueuedPromptWithMediaUnderstanding,
+} from "./followup-media.js";
+
+const FILE_BLOCK = '<file name="doc.pdf" type="application/pdf">\nPDF content\n</file>';
+
+describe("findLastOccurrenceOutsideFileBlocks", () => {
+  it("returns -1 for empty search", () => {
+    expect(findLastOccurrenceOutsideFileBlocks("hello", "")).toBe(-1);
+  });
+
+  it("finds last occurrence in body region before file blocks", () => {
+    const value = `hello world hello\n${FILE_BLOCK}`;
+    // "hello" appears at 0 and 12 — both before the file block
+    expect(findLastOccurrenceOutsideFileBlocks(value, "hello")).toBe(12);
+  });
+
+  it("skips matches inside file block content", () => {
+    // "PDF content" appears only inside the file block — no valid match outside.
+    const value = `some text\n${FILE_BLOCK}`;
+    expect(findLastOccurrenceOutsideFileBlocks(value, "PDF content")).toBe(-1);
+  });
+
+  it("finds trailing occurrence outside file block even when also inside one", () => {
+    const value = `some text\n${FILE_BLOCK}\nPDF content`;
+    // "PDF content" appears inside the file block AND after it — the function
+    // should return the trailing occurrence that is outside the block.
+    const expected = value.lastIndexOf("PDF content");
+    expect(findLastOccurrenceOutsideFileBlocks(value, "PDF content")).toBe(expected);
+  });
+
+  it("finds occurrence when search itself contains file blocks", () => {
+    const bodyWithFile = `caption\n${FILE_BLOCK}`;
+    const value = `previous\n${bodyWithFile}\nlater\n${bodyWithFile}`;
+    // Should find the *last* (trailing) occurrence
+    const expected = value.lastIndexOf(bodyWithFile);
+    expect(findLastOccurrenceOutsideFileBlocks(value, bodyWithFile)).toBe(expected);
+    expect(expected).toBeGreaterThan(value.indexOf(bodyWithFile));
+  });
+
+  it("returns index when no file blocks exist in value", () => {
+    expect(findLastOccurrenceOutsideFileBlocks("abc abc", "abc")).toBe(4);
+  });
+
+  it("finds body text after thread-history file blocks", () => {
+    const value = `Thread history\n${FILE_BLOCK}\n\ncheck this out`;
+    // The body "check this out" appears after a file block from thread history.
+    // The old truncation approach would miss this; the new approach finds it.
+    expect(findLastOccurrenceOutsideFileBlocks(value, "check this out")).toBe(
+      value.lastIndexOf("check this out"),
+    );
+  });
+});
+
+describe("normalizeUpdatedBody", () => {
+  it("returns empty string when updatedBody is empty", () => {
+    expect(normalizeUpdatedBody({ originalBody: "foo", updatedBody: "" })).toBe("");
+  });
+
+  it("returns updatedBody when originalBody is empty", () => {
+    expect(normalizeUpdatedBody({ updatedBody: "hello" })).toBe("hello");
+  });
+
+  it("strips directives when updatedBody equals originalBody", () => {
+    const body = "/think high tell me a joke";
+    const result = normalizeUpdatedBody({ originalBody: body, updatedBody: body });
+    expect(result).toBe("tell me a joke");
+  });
+
+  it("does not corrupt file block content during directive cleanup", () => {
+    const originalBody = "/think high tell me about this file";
+    // updatedBody has the original body plus a file block appended by media processing
+    const updatedBody = `${originalBody}\n${FILE_BLOCK}`;
+    const result = normalizeUpdatedBody({ originalBody, updatedBody });
+    // The directive should be stripped from the body portion, file block preserved
+    expect(result).toContain("tell me about this file");
+    expect(result).toContain(FILE_BLOCK);
+    expect(result).not.toContain("/think");
+  });
+
+  it("replaces in body region, not inside file blocks", () => {
+    const originalBody = "PDF content";
+    const updatedBody = `PDF content\n<file name="doc.pdf" type="application/pdf">\nPDF content\n</file>`;
+    // The replacement should target the body region "PDF content" before the
+    // file block, not the "PDF content" inside the <file> block.
+    const result = normalizeUpdatedBody({ originalBody, updatedBody });
+    // With no directives to strip, original === cleaned, updatedBody !== originalBody
+    // because updatedBody has the file block appended.  The replacement targets the
+    // body-region occurrence.
+    expect(result).toContain('<file name="doc.pdf"');
+    expect(result).toContain("PDF content\n</file>");
+  });
+});
+
+describe("rebuildQueuedPromptWithMediaUnderstanding", () => {
+  it("replaces original body with updated body in prompt", () => {
+    const result = rebuildQueuedPromptWithMediaUnderstanding({
+      prompt: "thread context\nhello world",
+      originalBody: "hello world",
+      updatedBody: 'hello world\n<file name="a.pdf">data</file>',
+    });
+    expect(result).toContain('<file name="a.pdf">data</file>');
+    expect(result).toContain("thread context");
+  });
+
+  it("preserves file blocks in thread history when body is replaced", () => {
+    const prompt = `history\n<file name="old.pdf">old</file>\nhello world`;
+    const result = rebuildQueuedPromptWithMediaUnderstanding({
+      prompt,
+      originalBody: "hello world",
+      updatedBody: "hello world transcribed",
+    });
+    // The old file block from history should be preserved since updatedBody
+    // has no file blocks of its own.
+    expect(result).toContain('<file name="old.pdf">old</file>');
+    expect(result).toContain("hello world transcribed");
+  });
+});
diff --git a/src/auto-reply/reply/followup-media.ts b/src/auto-reply/reply/followup-media.ts
new file mode 100644
index 00000000000..f0d5d951683
--- /dev/null
+++ b/src/auto-reply/reply/followup-media.ts
@@ -0,0 +1,423 @@
+import path from "node:path";
+import { logVerbose } from "../../globals.js";
+import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
+import {
+  normalizeAttachments,
+  resolveAttachmentKind,
+} from "../../media-understanding/attachments.js";
+import { buildInboundMediaNote } from "../media-note.js";
+import type { MsgContext } from "../templating.js";
+import { parseInlineDirectives } from "./directive-handling.js";
+import type { FollowupMediaContext, FollowupRun } from "./queue/types.js";
+
+const MEDIA_ONLY_PLACEHOLDER = "[User sent media without caption]";
+const MEDIA_REPLY_HINT_PREFIX = "To send an image back, prefer the message tool";
+const LEADING_MEDIA_ATTACHED_LINE_RE = /^\[media attached(?: \d+\/\d+)?: [^\r\n]*\]$/;
+const FILE_BLOCK_RE = /<file\s+name="/i;
+const FILE_BLOCK_FULL_RE = /<file\s+name="[^"]*"[^>]*>[\s\S]*?<\/file>\n?/gi;
+
+function stripExistingFileBlocks(text: string): string {
+  return text.replace(FILE_BLOCK_FULL_RE, "").trim();
+}
+
+function stripLeadingMediaAttachedLines(prompt: string): string {
+  const lines = prompt.split("\n");
+  let index = 0;
+  while (index < lines.length) {
+    const trimmed = lines[index]?.trim() ?? "";
+    if (!LEADING_MEDIA_ATTACHED_LINE_RE.test(trimmed)) {
+      break;
+    }
+    index += 1;
+  }
+  return lines.slice(index).join("\n").trim();
+}
+
+function stripLeadingMediaReplyHint(prompt: string): string {
+  const lines = prompt.split("\n");
+  if ((lines[0] ?? "").startsWith(MEDIA_REPLY_HINT_PREFIX)) {
+    return lines.slice(1).join("\n").trim();
+  }
+  return prompt.trim();
+}
+
+/** Collect the [start, end) ranges of every `<file …>…</file>` block in `value`. */
+function collectFileBlockRanges(value: string): Array<[number, number]> {
+  const ranges: Array<[number, number]> = [];
+  const re = new RegExp(FILE_BLOCK_FULL_RE.source, FILE_BLOCK_FULL_RE.flags);
+  let m: RegExpExecArray | null;
+  while ((m = re.exec(value)) !== null) {
+    ranges.push([m.index, m.index + m[0].length]);
+  }
+  return ranges;
+}
+
+function isInsideFileBlock(
+  position: number,
+  length: number,
+  ranges: Array<[number, number]>,
+): boolean {
+  for (const [start, end] of ranges) {
+    if (position >= start && position + length <= end) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Find the last occurrence of `search` in `value` that is NOT inside a
+ * `<file …>…</file>` block.  Searches the full string with lastIndexOf,
+ * then walks backward past any matches that fall inside file blocks.
+ */
+function findLastOccurrenceOutsideFileBlocks(value: string, search: string): number {
+  if (!search) {
+    return -1;
+  }
+  const ranges = collectFileBlockRanges(value);
+  let pos = value.lastIndexOf(search);
+  while (pos >= 0 && isInsideFileBlock(pos, search.length, ranges)) {
+    pos = value.lastIndexOf(search, pos - 1);
+  }
+  return pos;
+}
+
+function replaceLastOccurrenceOutsideFileBlocks(
+  value: string,
+  search: string,
+  replacement: string,
+): string | undefined {
+  if (!search) {
+    return undefined;
+  }
+  const index = findLastOccurrenceOutsideFileBlocks(value, search);
+  if (index < 0) {
+    return undefined;
+  }
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
+function findTrailingReplacementTargetBeforeFileBlocks(
+  value: string,
+  targets: string[],
+): { index: number; target: string } | undefined {
+  let bestMatch: { index: number; target: string } | undefined;
+  for (const target of targets) {
+    const index = findLastOccurrenceOutsideFileBlocks(value, target);
+    if (index < 0) {
+      continue;
+    }
+    if (!bestMatch || index > bestMatch.index) {
+      bestMatch = { index, target };
+    }
+  }
+  return bestMatch;
+}
+
+function replaceOccurrenceAtIndex(
+  value: string,
+  search: string,
+  replacement: string,
+  index: number,
+): string {
+  return `${value.slice(0, index)}${replacement}${value.slice(index + search.length)}`;
+}
+
+function decodeXmlAttr(value: string): string {
+  return value
+    .replace(/&quot;/g, '"')
+    .replace(/&apos;/g, "'")
+    .replace(/&lt;/g, "<")
+    .replace(/&gt;/g, ">")
+    .replace(/&amp;/g, "&");
+}
+
+function extractAttachmentFileName(value?: string): string | undefined {
+  const trimmed = value?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  if (/^[a-zA-Z][a-zA-Z\d+.-]*:/.test(trimmed)) {
+    try {
+      const pathname = new URL(trimmed).pathname;
+      const basename = path.posix.basename(pathname);
+      return basename || undefined;
+    } catch {
+      // Fall back to path-style parsing below.
+    }
+  }
+  const normalized = trimmed.replace(/\\/g, "/");
+  const basename = path.posix.basename(normalized);
+  return basename || undefined;
+}
+
+function bodyContainsMatchingFileBlock(mediaContext: FollowupMediaContext): boolean {
+  const body = mediaContext.Body?.trim();
+  if (!body || !FILE_BLOCK_RE.test(body)) {
+    return false;
+  }
+  const bodyFileNames = new Set<string>();
+  for (const match of body.matchAll(/<file\s+name="([^"]*)"[^>]*>/gi)) {
+    const fileName = match[1]?.trim();
+    if (fileName) {
+      bodyFileNames.add(decodeXmlAttr(fileName));
+    }
+  }
+  if (bodyFileNames.size === 0) {
+    return false;
+  }
+  return normalizeAttachments(mediaContext as MsgContext).some((attachment) => {
+    const fileName = extractAttachmentFileName(attachment.path ?? attachment.url);
+    return Boolean(fileName && bodyFileNames.has(fileName));
+  });
+}
+
+function stripInlineDirectives(text: string | undefined): string {
+  return parseInlineDirectives(text ?? "").cleaned.trim();
+}
+
+function normalizeUpdatedBody(params: { originalBody?: string; updatedBody?: string }): string {
+  const updatedBody = params.updatedBody?.trim();
+  if (!updatedBody) {
+    return "";
+  }
+  const originalBody = params.originalBody?.trim();
+  if (!originalBody) {
+    return updatedBody;
+  }
+
+  const cleanedOriginalBody = stripInlineDirectives(originalBody);
+  if (!cleanedOriginalBody) {
+    return updatedBody;
+  }
+  if (updatedBody === originalBody) {
+    return cleanedOriginalBody;
+  }
+  return (
+    replaceLastOccurrenceOutsideFileBlocks(updatedBody, originalBody, cleanedOriginalBody) ??
+    updatedBody
+  ).trim();
+}
+
+function rebuildQueuedPromptWithMediaUnderstanding(params: {
+  prompt: string;
+  originalBody?: string;
+  updatedBody?: string;
+  mediaNote?: string;
+}): string {
+  let stripped = stripLeadingMediaAttachedLines(params.prompt);
+  if (!params.mediaNote) {
+    stripped = stripLeadingMediaReplyHint(stripped);
+  }
+
+  const replacementTargets = [
+    params.originalBody?.trim(),
+    stripInlineDirectives(params.originalBody),
+    MEDIA_ONLY_PLACEHOLDER,
+  ].filter(
+    (value, index, list): value is string => Boolean(value) && list.indexOf(value) === index,
+  );
+
+  // Strip pre-existing file blocks from the body region when the updated body
+  // contains new file blocks.  Mixed messages (audio + PDF) can arrive with
+  // file extraction already applied in the primary path; without this strip
+  // the old block stays in the prompt while the updated body adds a new one,
+  // duplicating potentially large file payloads.
+  // Scope stripping to the confirmed body segment so quoted/replied text,
+  // thread history above the body, and prompts whose original body no longer
+  // appears all retain any legitimate <file> blocks.
+  if (params.updatedBody && FILE_BLOCK_RE.test(params.updatedBody)) {
+    const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks(
+      stripped,
+      replacementTargets,
+    );
+    if (trailingMatch) {
+      stripped =
+        stripped.slice(0, trailingMatch.index) +
+        stripExistingFileBlocks(stripped.slice(trailingMatch.index));
+    }
+  }
+
+  const updatedBody = normalizeUpdatedBody({
+    originalBody: params.originalBody,
+    updatedBody: params.updatedBody,
+  });
+  if (!updatedBody) {
+    return [params.mediaNote?.trim(), stripped].filter(Boolean).join("\n").trim();
+  }
+
+  let rebuilt = stripped;
+  const trailingMatch = findTrailingReplacementTargetBeforeFileBlocks(rebuilt, replacementTargets);
+  if (trailingMatch) {
+    rebuilt = replaceOccurrenceAtIndex(
+      rebuilt,
+      trailingMatch.target,
+      updatedBody,
+      trailingMatch.index,
+    );
+    return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+  }
+
+  rebuilt = [rebuilt, updatedBody].filter(Boolean).join("\n\n");
+  return [params.mediaNote?.trim(), rebuilt.trim()].filter(Boolean).join("\n").trim();
+}
+
+function hasMediaAttachments(mediaContext: FollowupMediaContext): boolean {
+  return Boolean(
+    mediaContext.MediaPath?.trim() ||
+    mediaContext.MediaUrl?.trim() ||
+    (Array.isArray(mediaContext.MediaPaths) && mediaContext.MediaPaths.length > 0) ||
+    (Array.isArray(mediaContext.MediaUrls) && mediaContext.MediaUrls.length > 0),
+  );
+}
+
+function hasOnlyFileLikeAttachments(mediaContext: FollowupMediaContext): boolean {
+  const attachments = normalizeAttachments(mediaContext as MsgContext);
+  return (
+    attachments.length > 0 &&
+    attachments.every((attachment) => {
+      const kind = resolveAttachmentKind(attachment);
+      return kind !== "audio" && kind !== "image" && kind !== "video";
+    })
+  );
+}
+
+function hasAnyFileAttachments(mediaContext: FollowupMediaContext): boolean {
+  return normalizeAttachments(mediaContext as MsgContext).some((attachment) => {
+    const kind = resolveAttachmentKind(attachment);
+    return kind !== "audio" && kind !== "image" && kind !== "video";
+  });
+}
+
+function snapshotUpdatedMediaContext(params: {
+  original: FollowupMediaContext;
+  mediaCtx: MsgContext;
+  updatedBody?: string;
+  appliedFile?: boolean;
+}): FollowupMediaContext {
+  return {
+    ...params.original,
+    Body: params.updatedBody ?? params.original.Body,
+    Transcript:
+      typeof params.mediaCtx.Transcript === "string"
+        ? params.mediaCtx.Transcript
+        : params.original.Transcript,
+    MediaUnderstanding: Array.isArray(params.mediaCtx.MediaUnderstanding)
+      ? [...params.mediaCtx.MediaUnderstanding]
+      : params.original.MediaUnderstanding,
+    MediaUnderstandingDecisions: Array.isArray(params.mediaCtx.MediaUnderstandingDecisions)
+      ? [...params.mediaCtx.MediaUnderstandingDecisions]
+      : params.original.MediaUnderstandingDecisions,
+    DeferredMediaApplied: true,
+    DeferredFileBlocksExtracted:
+      params.original.DeferredFileBlocksExtracted || params.appliedFile || undefined,
+  };
+}
+
+// Exported for unit testing — these are pure string helpers with no side effects.
+export {
+  findLastOccurrenceOutsideFileBlocks as _findLastOccurrenceOutsideFileBlocks,
+  normalizeUpdatedBody as _normalizeUpdatedBody,
+  rebuildQueuedPromptWithMediaUnderstanding as _rebuildQueuedPromptWithMediaUnderstanding,
+};
+
+export async function applyDeferredMediaUnderstandingToQueuedRun(
+  queued: FollowupRun,
+  params: { logLabel?: string } = {},
+): Promise<void> {
+  // NOTE: collect-mode and overflow-summary queue drains create synthetic
+  // followup runs without mediaContext — those paths are not covered here
+  // and rely on their own prompt-building logic in queue/drain.ts.
+  const mediaContext = queued.mediaContext;
+  if (!mediaContext || mediaContext.DeferredMediaApplied) {
+    return;
+  }
+  if (!hasMediaAttachments(mediaContext)) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
+  }
+  const referenceBody = mediaContext.RawBody ?? mediaContext.Body;
+  // Prefer RawBody-vs-Body comparison when RawBody exists. If RawBody is
+  // missing, fall back to explicit file-extraction signals instead of
+  // re-running extraction just because the clean pre-extraction body is gone.
+  if (!mediaContext.DeferredFileBlocksExtracted && hasAnyFileAttachments(mediaContext)) {
+    const rawBodyMissing = typeof mediaContext.RawBody !== "string";
+    if (mediaContext.Body !== referenceBody) {
+      mediaContext.DeferredFileBlocksExtracted = true;
+    } else if (
+      rawBodyMissing &&
+      (Boolean(mediaContext.MediaUnderstanding?.length) ||
+        bodyContainsMatchingFileBlock(mediaContext))
+    ) {
+      mediaContext.DeferredFileBlocksExtracted = true;
+    }
+  }
+  if (mediaContext.MediaUnderstanding?.length) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
+  }
+
+  if (mediaContext.DeferredFileBlocksExtracted && hasOnlyFileLikeAttachments(mediaContext)) {
+    mediaContext.DeferredMediaApplied = true;
+    return;
+  }
+
+  const resolvedOriginalBody =
+    mediaContext.CommandBody ?? mediaContext.RawBody ?? mediaContext.Body;
+
+  try {
+    const mediaCtx = {
+      ...mediaContext,
+      Body: resolvedOriginalBody,
+      Provider:
+        mediaContext.Provider ??
+        queued.run.messageProvider ??
+        (typeof mediaContext.OriginatingChannel === "string"
+          ? mediaContext.OriginatingChannel
+          : undefined),
+      Surface: mediaContext.Surface,
+    } as MsgContext;
+
+    const muResult = await applyMediaUnderstanding({
+      ctx: mediaCtx,
+      cfg: queued.run.config,
+      agentDir: queued.run.agentDir,
+      activeModel: {
+        provider: queued.run.provider,
+        model: queued.run.model,
+      },
+    });
+
+    const shouldRebuildPrompt =
+      muResult.outputs.length > 0 ||
+      muResult.appliedAudio ||
+      muResult.appliedImage ||
+      muResult.appliedVideo ||
+      (muResult.appliedFile && !mediaContext.DeferredFileBlocksExtracted);
+
+    if (shouldRebuildPrompt) {
+      const newMediaNote = buildInboundMediaNote(mediaCtx);
+      queued.prompt = rebuildQueuedPromptWithMediaUnderstanding({
+        prompt: queued.prompt,
+        originalBody: resolvedOriginalBody,
+        updatedBody: mediaCtx.Body,
+        mediaNote: newMediaNote,
+      });
+      logVerbose(
+        `${params.logLabel ?? "followup"}: applied media understanding (audio=${muResult.appliedAudio}, image=${muResult.appliedImage}, video=${muResult.appliedVideo}, file=${muResult.appliedFile})`,
+      );
+    }
+
+    queued.mediaContext = snapshotUpdatedMediaContext({
+      original: mediaContext,
+      mediaCtx,
+      updatedBody: shouldRebuildPrompt ? mediaCtx.Body : undefined,
+      appliedFile: muResult.appliedFile,
+    });
+  } catch (err) {
+    mediaContext.DeferredMediaApplied = true;
+    logVerbose(
+      `${params.logLabel ?? "followup"}: media understanding failed, proceeding with raw content: ${err instanceof Error ? err.message : String(err)}`,
+    );
+  }
+}
diff --git a/src/auto-reply/reply/followup-runner.test.ts b/src/auto-reply/reply/followup-runner.test.ts
index c8e33397a2a..ed076b4da59 100644
--- a/src/auto-reply/reply/followup-runner.test.ts
+++ b/src/auto-reply/reply/followup-runner.test.ts
@@ -2,13 +2,15 @@ import fs from "node:fs/promises";
 import { tmpdir } from "node:os";
 import path from "node:path";
 import { beforeEach, describe, expect, it, vi } from "vitest";
-import { loadSessionStore, saveSessionStore, type SessionEntry } from "../../config/sessions.js";
-import type { FollowupRun } from "./queue.js";
+import { loadSessionStore, type SessionEntry, saveSessionStore } from "../../config/sessions.js";
+import { buildCollectPrompt } from "../../utils/queue-helpers.js";
+import type { FollowupRun } from "./queue/types.js";
 import { createMockFollowupRun, createMockTypingController } from "./test-helpers.js";
 
 const runEmbeddedPiAgentMock = vi.fn();
 const routeReplyMock = vi.fn();
 const isRoutableChannelMock = vi.fn();
+const applyMediaUnderstandingMock = vi.fn();
 
 vi.mock(
   "../../agents/model-fallback.js",
@@ -19,6 +21,10 @@ vi.mock("../../agents/pi-embedded.js", () => ({
   runEmbeddedPiAgent: (params: unknown) => runEmbeddedPiAgentMock(params),
 }));
 
+vi.mock("../../media-understanding/apply.js", () => ({
+  applyMediaUnderstanding: (params: unknown) => applyMediaUnderstandingMock(params),
+}));
+
 vi.mock("./route-reply.js", async (importOriginal) => {
   const actual = await importOriginal<typeof import("./route-reply.js")>();
   return {
@@ -29,6 +35,10 @@ vi.mock("./route-reply.js", async (importOriginal) => {
 });
 
 import { createFollowupRunner } from "./followup-runner.js";
+import {
+  applyDeferredMediaToQueuedRuns,
+  buildMediaAwareQueueSummaryPrompt,
+} from "./queue/drain.js";
 
 const ROUTABLE_TEST_CHANNELS = new Set([
   "telegram",
@@ -47,13 +57,27 @@ beforeEach(() => {
   isRoutableChannelMock.mockImplementation((ch: string | undefined) =>
     Boolean(ch?.trim() && ROUTABLE_TEST_CHANNELS.has(ch.trim().toLowerCase())),
   );
+  applyMediaUnderstandingMock.mockReset();
+  applyMediaUnderstandingMock.mockResolvedValue({
+    outputs: [],
+    decisions: [],
+    appliedImage: false,
+    appliedAudio: false,
+    appliedVideo: false,
+    appliedFile: false,
+  });
 });
 
 const baseQueuedRun = (messageProvider = "whatsapp"): FollowupRun =>
   createMockFollowupRun({ run: { messageProvider } });
 
+const MEDIA_REPLY_HINT =
+  "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body.";
+
 function createQueuedRun(
-  overrides: Partial<Omit<FollowupRun, "run">> & { run?: Partial<FollowupRun["run"]> } = {},
+  overrides: Partial<Omit<FollowupRun, "run">> & {
+    run?: Partial<FollowupRun["run"]>;
+  } = {},
 ): FollowupRun {
   return createMockFollowupRun(overrides);
 }
@@ -398,7 +422,12 @@ describe("createFollowupRunner messaging tool dedupe", () => {
       agentResult: {
         ...makeTextReplyDedupeResult(),
         messagingToolSentTargets: [
-          { tool: "telegram", provider: "telegram", to: "268300329", accountId: "work" },
+          {
+            tool: "telegram",
+            provider: "telegram",
+            to: "268300329",
+            accountId: "work",
+          },
         ],
       },
       queued: {
@@ -448,8 +477,13 @@ describe("createFollowupRunner messaging tool dedupe", () => {
       "sessions.json",
     );
     const sessionKey = "main";
-    const sessionEntry: SessionEntry = { sessionId: "session", updatedAt: Date.now() };
-    const sessionStore: Record<string, SessionEntry> = { [sessionKey]: sessionEntry };
+    const sessionEntry: SessionEntry = {
+      sessionId: "session",
+      updatedAt: Date.now(),
+    };
+    const sessionStore: Record<string, SessionEntry> = {
+      [sessionKey]: sessionEntry,
+    };
     await saveSessionStore(storePath, sessionStore);
 
     const { onBlockReply } = await runMessagingCase({
@@ -643,7 +677,1712 @@ describe("createFollowupRunner agentDir forwarding", () => {
     });
 
     expect(runEmbeddedPiAgentMock).toHaveBeenCalledTimes(1);
-    const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as { agentDir?: string };
+    const call = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      agentDir?: string;
+    };
     expect(call?.agentDir).toBe(agentDir);
   });
 });
+
+describe("createFollowupRunner media understanding", () => {
+  it("applies audio transcription when mediaContext has untranscribed audio", async () => {
+    const transcriptText = "Hello, this is a voice note.";
+    // The real applyMediaUnderstanding mutates the ctx; the mock must do the same
+    // so buildInboundMediaNote and queued prompt rebuilding see the transcribed body.
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "Got it!" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+        // MediaUnderstanding is empty — transcription not yet applied
+      },
+    });
+    await runner(queued);
+
+    // applyMediaUnderstanding should have been called
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledWith(
+      expect.objectContaining({
+        cfg: queued.run.config,
+        agentDir: queued.run.agentDir,
+      }),
+    );
+
+    // The prompt passed to the agent should include the transcript, not the
+    // raw audio attachment line.
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg");
+
+    expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "Got it!" }));
+  });
+
+  it("propagates the queued message provider into deferred media context", async () => {
+    const transcriptText = "Provider-aware transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        expect(params.ctx.Provider).toBe("telegram");
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "done" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[User sent media without caption]",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+  });
+
+  it("applies media understanding for URL-only attachments", async () => {
+    const transcriptText = "URL-only transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "Got it!" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: https://cdn.example.com/voice.ogg (audio/ogg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          MediaUrl: "https://cdn.example.com/voice.ogg",
+          MediaUrls: ["https://cdn.example.com/voice.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+  });
+
+  it("strips the full media line when attachment paths or URLs contain brackets", async () => {
+    const transcriptText = "Bracket-safe transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "done" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/voice[0].ogg"],
+          MediaUrls: ["https://cdn.example.com/files[0].ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("[media attached:");
+    expect(agentCall?.prompt).not.toContain("files[0].ogg]");
+  });
+
+  it("only strips leading synthetic media lines and preserves literal user text later in the prompt", async () => {
+    const transcriptText = "Transcript with literal token";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = "I literally typed [media attached: keep me] in this message.";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "done" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice.ogg (audio/ogg)]\nI literally typed [media attached: keep me] in this message.",
+        mediaContext: {
+          Body: "I literally typed [media attached: keep me] in this message.",
+          CommandBody: "I literally typed [media attached: keep me] in this message.",
+          RawBody: "I literally typed [media attached: keep me] in this message.",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(
+      "I literally typed [media attached: keep me] in this message.",
+    );
+    expect(agentCall?.prompt).not.toContain("[media attached: /tmp/voice.ogg (audio/ogg)]");
+  });
+
+  it("skips media understanding when MediaUnderstanding is already populated", async () => {
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "reply" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: "[Audio]\nTranscript:\nAlready transcribed.\n\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+        // MediaUnderstanding already populated — transcription was applied in primary path
+        MediaUnderstanding: [
+          {
+            kind: "audio.transcription",
+            text: "Already transcribed.",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ],
+      },
+    });
+    await runner(queued);
+
+    // Should NOT re-run media understanding
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+
+    // The original prompt should be passed through unchanged
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Already transcribed.");
+  });
+
+  it("skips media understanding when no mediaContext is present", async () => {
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "reply" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // No mediaContext (plain text message)
+    const queued = createQueuedRun({ prompt: "just text" });
+    await runner(queued);
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+  });
+
+  it("continues with raw prompt when media understanding fails", async () => {
+    applyMediaUnderstandingMock.mockRejectedValueOnce(new Error("transcription service down"));
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "fallback reply" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const originalPrompt = "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text";
+    const queued = createQueuedRun({
+      prompt: originalPrompt,
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+      },
+    });
+    await runner(queued);
+
+    // Should have attempted media understanding
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+
+    // Agent should still run with the original prompt
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toBe(originalPrompt);
+
+    expect(onBlockReply).toHaveBeenCalledWith(expect.objectContaining({ text: "fallback reply" }));
+  });
+
+  it("rebuilds the prompt when file extraction succeeds without media outputs", async () => {
+    const fileBlock = '<file name="notes.txt" mime="text/plain">\nline one\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `some text\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "file processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/notes.txt (text/plain)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[media attached: /tmp/notes.txt (text/plain)]");
+    expect(agentCall?.prompt).toContain(fileBlock);
+    expect(agentCall?.prompt?.match(/<file\b/g)).toHaveLength(1);
+  });
+
+  it("preserves non-audio media lines when only audio is transcribed", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        // Simulate transcription updating the context
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: "voice transcript",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = "voice transcript";
+        params.ctx.Body = "[Audio]\nUser text:\nsome text\nTranscript:\nvoice transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "voice transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "got both" }],
+      meta: {},
+    });
+
+    const onBlockReply = vi.fn(async () => {});
+    const runner = createFollowupRunner({
+      opts: { onBlockReply },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt:
+        "[media attached: 2 files]\n[media attached 1/2: /tmp/voice.ogg (audio/ogg)]\n[media attached 2/2: /tmp/photo.jpg (image/jpeg)]\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg", "/tmp/photo.jpg"],
+        MediaTypes: ["audio/ogg", "image/jpeg"],
+      },
+    });
+    await runner(queued);
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // Audio attachment line should be stripped
+    expect(agentCall?.prompt).not.toContain("voice.ogg");
+    // Image attachment line should also be stripped (all media-attached lines are
+    // removed and replaced by the new buildInboundMediaNote output)
+    // The transcript should be present
+    expect(agentCall?.prompt).toContain("voice transcript");
+  });
+
+  it("strips queued media lines when attachment paths or URLs contain a literal closing bracket", async () => {
+    const transcriptText = "Bracket-safe transcript";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nUser text:\nsome text\nTranscript:\n${transcriptText}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "ok" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          "[media attached: /tmp/voice[0].ogg (audio/ogg) | https://cdn.example.com/files[0].ogg?sig=abc]123]\n" +
+          MEDIA_REPLY_HINT +
+          "\n" +
+          "some text",
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/voice[0].ogg"],
+          MediaUrls: ["https://cdn.example.com/files[0].ogg?sig=abc]123"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).not.toContain("/tmp/voice[0].ogg");
+    expect(agentCall?.prompt).not.toContain("https://cdn.example.com/files[0].ogg?sig=abc]123");
+    expect(agentCall?.prompt).not.toContain(MEDIA_REPLY_HINT);
+  });
+
+  it("preserves file-only media understanding when outputs are empty", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body =
+          '<file name="report.pdf" mime="application/pdf">\nQuarterly report body\n</file>';
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\n[User sent media without caption]`,
+        mediaContext: {
+          Body: "",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[media attached: /tmp/report.pdf (application/pdf)]");
+    expect(agentCall?.prompt).toContain(MEDIA_REPLY_HINT);
+    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
+    expect(agentCall?.prompt).toContain("Quarterly report body");
+    expect(agentCall?.prompt).not.toContain("[User sent media without caption]");
+  });
+
+  it("replaces the queued body when inline directives were already stripped from the prompt", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body =
+          '/think high summarize this\n\n<file name="report.pdf" mime="application/pdf">\nreport\n</file>';
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).toContain('<file name="report.pdf" mime="application/pdf">');
+    expect(agentCall?.prompt).not.toContain("summarize this\n\n/think high summarize this");
+    expect(agentCall?.prompt).not.toContain("/think high summarize this");
+  });
+
+  it("preserves directive-like tokens inside extracted media content", async () => {
+    const fileBlock =
+      '<file name="notes.txt" mime="text/plain">\n/model claude-opus should stay\n/queue followup should stay\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `/think high summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).not.toContain("/think high summarize this");
+    expect(agentCall?.prompt).toContain("/model claude-opus should stay");
+    expect(agentCall?.prompt).toContain("/queue followup should stay");
+  });
+
+  it("rebuilds the prompt when image understanding mutates the body without outputs", async () => {
+    const description = "[Image]\nDescription:\na mountain at sunset";
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = description;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: true,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/photo.jpg (image/jpeg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/photo.jpg"],
+          MediaTypes: ["image/jpeg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("a mountain at sunset");
+  });
+
+  it("does not false-positive on user text containing literal '<file' when extracting files", async () => {
+    const fileBlock = '<file name="data.csv" mime="text/csv">\ncol1,col2\n1,2\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `check my <file upload please\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "got it" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // User message contains literal "<file" text but that should NOT prevent
+    // file extraction results from being embedded in the prompt.
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/data.csv (text/csv)]\ncheck my <file upload please",
+        mediaContext: {
+          Body: "check my <file upload please",
+          CommandBody: "check my <file upload please",
+          RawBody: "check my <file upload please",
+          MediaPaths: ["/tmp/data.csv"],
+          MediaTypes: ["text/csv"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // The file extraction result should be present in the prompt
+    expect(agentCall?.prompt).toContain(fileBlock);
+    expect(agentCall?.prompt).toContain("check my <file upload please");
+  });
+
+  it("preserves directive-like text that appears inside extracted file content", async () => {
+    const fileBlock =
+      '<file name="notes.txt" mime="text/plain">\nRun `/think high` literally in the shell example.\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = `summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/notes.txt]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          CommandBody: "summarize this",
+          RawBody: "/think high summarize this",
+          MediaPaths: ["/tmp/notes.txt"],
+          MediaTypes: ["text/plain"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("summarize this");
+    expect(agentCall?.prompt).toContain("Run `/think high` literally in the shell example.");
+  });
+
+  it("rebuilds the prompt when image understanding mutates the body alongside existing text", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = "some text\n\n[Image summary]\nA whiteboard with action items.";
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: true,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[media attached: /tmp/board.jpg (image/jpeg)]\nsome text",
+        mediaContext: {
+          Body: "some text",
+          CommandBody: "some text",
+          RawBody: "some text",
+          MediaPaths: ["/tmp/board.jpg"],
+          MediaTypes: ["image/jpeg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("[Image summary]");
+    expect(agentCall?.prompt).toContain("A whiteboard with action items.");
+  });
+
+  it("applies media understanding for URL-only deferred attachments", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.Body = "[Audio]\nTranscript:\nremote transcript";
+        params.ctx.Transcript = "remote transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "remote transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: "[User sent media without caption]",
+        mediaContext: {
+          Body: "",
+          MediaUrl: "https://cdn.example.com/audio.ogg",
+          MediaUrls: ["https://cdn.example.com/audio.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(1);
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("remote transcript");
+  });
+
+  it("uses resolved body (CommandBody) as originalBody for accurate prompt replacement", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        // applyMediaUnderstanding mutates the resolved body (which is CommandBody)
+        params.ctx.Body = `summarize this\n\n${fileBlock}`;
+        return {
+          outputs: [],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: false,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // Body has directive prefix; CommandBody has the cleaned version.
+    // The prompt was built from CommandBody, so originalBody should match CommandBody
+    // for accurate replacement.
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this`,
+        mediaContext: {
+          Body: "/think high summarize this",
+          CommandBody: "summarize this",
+          RawBody: "/think high summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // File block should be present (extraction succeeded)
+    expect(agentCall?.prompt).toContain(fileBlock);
+    // The body text should appear once, not duplicated
+    expect(agentCall?.prompt).toContain("summarize this");
+    // Should NOT contain the directive prefix
+    expect(agentCall?.prompt).not.toContain("/think high");
+    // The body should not be duplicated (would happen if originalBody didn't match)
+    const matches = agentCall?.prompt?.match(/summarize this/g);
+    expect(matches?.length).toBe(1);
+  });
+
+  it("does not duplicate file blocks for mixed audio+file messages re-processed in followup", async () => {
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';
+    const newFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nnew extracted content\n</file>';
+    const transcriptText = "Mixed message transcript";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nanalyze this\n\n${newFileBlock}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // Simulate a mixed message where the primary path already extracted the
+    // PDF (file block is in the prompt) but audio transcription failed.
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nanalyze this\n\n${existingFileBlock}`,
+        mediaContext: {
+          Body: `analyze this\n\n${existingFileBlock}`,
+          CommandBody: "analyze this",
+          RawBody: "analyze this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    // Should contain the transcript
+    expect(agentCall?.prompt).toContain(transcriptText);
+    // Should have exactly one file block (the new one), not two
+    expect(agentCall?.prompt?.match(/<file\s+name="report\.pdf"/g)).toHaveLength(1);
+    expect(agentCall?.prompt).toContain("new extracted content");
+    expect(agentCall?.prompt).not.toContain("old extracted content");
+  });
+
+  it("preserves unrelated file blocks when the original body is absent from the prompt", async () => {
+    const quotedFileBlock =
+      '<file name="thread.pdf" mime="application/pdf">\nquoted thread attachment\n</file>';
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';
+    const newFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nnew extracted content\n</file>';
+    const transcriptText = "Transcript from deferred audio";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nQuoted thread above\n\n${quotedFileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${existingFileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Quoted thread above");
+    expect(agentCall?.prompt).toContain(quotedFileBlock);
+    expect(agentCall?.prompt).toContain(newFileBlock);
+    expect(agentCall?.prompt?.match(/<file\s+name="/g)).toHaveLength(2);
+  });
+
+  it("replaces the visible body before file blocks instead of matching file content", async () => {
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nsummary notes:\nsummarize this\n</file>';
+    const transcriptText = "Transcript from deferred audio";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${existingFileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${existingFileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this`;
+    expect(agentCall?.prompt).toContain(existingFileBlock);
+    expect(agentCall?.prompt).toContain(transcriptBlock);
+    expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeGreaterThan(-1);
+    expect(agentCall?.prompt?.indexOf(transcriptBlock)).toBeLessThan(
+      agentCall?.prompt?.indexOf(existingFileBlock) ?? -1,
+    );
+  });
+
+  it("finds the body after thread-history file blocks when body appears after the first <file> tag", async () => {
+    const threadFileBlock =
+      '<file name="thread.pdf" mime="application/pdf">\nolder thread attachment\n</file>';
+    const transcriptText = "Transcript from deferred voice note";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    // The prompt has thread history with a file block BEFORE the current
+    // queued body text.  The old truncation approach would miss the body
+    // because it only searched before the first <file> tag.
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/voice.ogg]\n${MEDIA_REPLY_HINT}\nThread history\n\n${threadFileBlock}\n\ncheck this out`,
+        mediaContext: {
+          Body: "check this out",
+          RawBody: "check this out",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    const transcriptBlock = `[Audio]\nTranscript:\n${transcriptText}\n\ncheck this out`;
+    // The body should be replaced with the transcript block
+    expect(agentCall?.prompt).toContain(transcriptBlock);
+    // Thread history and its file block should be preserved
+    expect(agentCall?.prompt).toContain("Thread history");
+    expect(agentCall?.prompt).toContain(threadFileBlock);
+  });
+
+  it("sets DeferredMediaApplied when media understanding throws", async () => {
+    applyMediaUnderstandingMock.mockRejectedValueOnce(
+      new Error("transcription service unavailable"),
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "fallback reply" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    const queued = createQueuedRun({
+      prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text",
+      mediaContext: {
+        Body: "some text",
+        MediaPaths: ["/tmp/voice.ogg"],
+        MediaTypes: ["audio/ogg"],
+      },
+    });
+
+    await runner(queued);
+
+    // DeferredMediaApplied should be set so re-runs don't retry
+    expect(queued.mediaContext?.DeferredMediaApplied).toBe(true);
+
+    // The agent should still be called with the raw prompt
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("some text");
+  });
+
+  it("does not re-apply file extraction when the stored media body already has a file block", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${fileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt?.match(/<file\s+name="report\.pdf"/g)).toHaveLength(1);
+  });
+
+  it("does not re-apply file extraction when RawBody is missing but Body already has a matching file block", async () => {
+    const fileBlock = '<file name="report.pdf" mime="application/pdf">\nreport content\n</file>';
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt: `[media attached: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nsummarize this\n\n${fileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${fileBlock}`,
+          CommandBody: "summarize this",
+          MediaPaths: ["/tmp/report.pdf"],
+          MediaTypes: ["application/pdf"],
+        },
+      }),
+    );
+
+    expect(applyMediaUnderstandingMock).not.toHaveBeenCalled();
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain(fileBlock);
+  });
+
+  it("replaces the trailing repeated body segment instead of the first matching thread text", async () => {
+    const existingFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nold extracted content\n</file>';
+    const newFileBlock =
+      '<file name="report.pdf" mime="application/pdf">\nnew extracted content\n</file>';
+    const transcriptText = "Deferred transcript";
+
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: transcriptText,
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = transcriptText;
+        params.ctx.Body = `[Audio]\nTranscript:\n${transcriptText}\n\nsummarize this\n\n${newFileBlock}`;
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: transcriptText,
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: true,
+        };
+      },
+    );
+    runEmbeddedPiAgentMock.mockResolvedValueOnce({
+      payloads: [{ text: "processed" }],
+      meta: {},
+    });
+
+    const runner = createFollowupRunner({
+      opts: { onBlockReply: vi.fn(async () => {}) },
+      typing: createMockTypingController(),
+      typingMode: "instant",
+      defaultModel: "anthropic/claude-opus-4-5",
+    });
+
+    await runner(
+      createQueuedRun({
+        prompt:
+          `[media attached 1/2: /tmp/voice.ogg]\n[media attached 2/2: /tmp/report.pdf]\n${MEDIA_REPLY_HINT}\nThread history: summarize this\n\n` +
+          `summarize this\n\n${existingFileBlock}`,
+        mediaContext: {
+          Body: `summarize this\n\n${existingFileBlock}`,
+          CommandBody: "summarize this",
+          RawBody: "summarize this",
+          MediaPaths: ["/tmp/voice.ogg", "/tmp/report.pdf"],
+          MediaTypes: ["audio/ogg", "application/pdf"],
+        },
+      }),
+    );
+
+    const agentCall = runEmbeddedPiAgentMock.mock.calls.at(-1)?.[0] as {
+      prompt?: string;
+    };
+    expect(agentCall?.prompt).toContain("Thread history: summarize this");
+    expect(agentCall?.prompt).toContain(transcriptText);
+    expect(agentCall?.prompt).toContain(newFileBlock);
+    expect(agentCall?.prompt).not.toContain("old extracted content");
+    expect(agentCall?.prompt?.indexOf(newFileBlock)).toBeGreaterThan(
+      agentCall?.prompt?.lastIndexOf("summarize this") ?? -1,
+    );
+  });
+});
+
+describe("followup queue drain deferred media understanding", () => {
+  it("preprocesses collect batches before synthesizing the followup prompt", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: "collect transcript",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = "collect transcript";
+        params.ctx.Body = "[Audio]\nTranscript:\ncollect transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "collect transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    const items: FollowupRun[] = [
+      createQueuedRun({
+        prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]\nsome text",
+        summaryLine: "some text",
+        originatingChannel: "telegram",
+        originatingTo: "chat:1",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "some text",
+          MediaPaths: ["/tmp/voice.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+      createQueuedRun({
+        prompt: "second text",
+        summaryLine: "second text",
+        originatingChannel: "telegram",
+        originatingTo: "chat:1",
+        run: { messageProvider: "telegram" },
+      }),
+    ];
+
+    await applyDeferredMediaToQueuedRuns(items);
+
+    const prompt = buildCollectPrompt({
+      title: "[Queued messages while agent was busy]",
+      items,
+      renderItem: (item, idx) => `---\nQueued #${idx + 1}\n${item.prompt}`.trim(),
+    });
+
+    expect(prompt).toContain("collect transcript");
+    expect(prompt).toContain("Queued #2\nsecond text");
+    expect(prompt).not.toContain("[media attached: /tmp/voice.ogg");
+  });
+
+  it("preprocesses queued runs in parallel", async () => {
+    const resolvers: Array<() => void> = [];
+    const done = () => ({
+      outputs: [],
+      decisions: [],
+      appliedImage: false,
+      appliedAudio: false,
+      appliedVideo: false,
+      appliedFile: false,
+    });
+    applyMediaUnderstandingMock.mockImplementation(
+      async () =>
+        await new Promise((resolve) => {
+          resolvers.push(() => resolve(done()));
+        }),
+    );
+
+    const items: FollowupRun[] = [
+      createQueuedRun({
+        prompt: "[media attached: /tmp/voice-1.ogg (audio/ogg)]\nfirst text",
+        summaryLine: "first text",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "first text",
+          MediaPaths: ["/tmp/voice-1.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+      createQueuedRun({
+        prompt: "[media attached: /tmp/voice-2.ogg (audio/ogg)]\nsecond text",
+        summaryLine: "second text",
+        run: { messageProvider: "telegram" },
+        mediaContext: {
+          Body: "second text",
+          MediaPaths: ["/tmp/voice-2.ogg"],
+          MediaTypes: ["audio/ogg"],
+        },
+      }),
+    ];
+
+    const pending = applyDeferredMediaToQueuedRuns(items);
+
+    expect(applyMediaUnderstandingMock).toHaveBeenCalledTimes(2);
+
+    for (const resolve of resolvers) {
+      resolve();
+    }
+    await pending;
+  });
+
+  it("preprocesses dropped media items before building overflow summaries", async () => {
+    applyMediaUnderstandingMock.mockImplementationOnce(
+      async (params: { ctx: Record<string, unknown> }) => {
+        params.ctx.MediaUnderstanding = [
+          {
+            kind: "audio.transcription",
+            text: "overflow transcript",
+            attachmentIndex: 0,
+            provider: "whisper",
+          },
+        ];
+        params.ctx.Transcript = "overflow transcript";
+        params.ctx.Body = "[Audio]\nTranscript:\noverflow transcript";
+        return {
+          outputs: [
+            {
+              kind: "audio.transcription",
+              text: "overflow transcript",
+              attachmentIndex: 0,
+              provider: "whisper",
+            },
+          ],
+          decisions: [],
+          appliedImage: false,
+          appliedAudio: true,
+          appliedVideo: false,
+          appliedFile: false,
+        };
+      },
+    );
+    const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({
+      dropPolicy: "summarize",
+      droppedCount: 1,
+      summaryLines: ["[media attached: /tmp/voice.ogg (audio/ogg)]"],
+      summaryItems: [
+        createQueuedRun({
+          prompt: "[media attached: /tmp/voice.ogg (audio/ogg)]",
+          summaryLine: "",
+          run: { messageProvider: "telegram" },
+          mediaContext: {
+            Body: "",
+            MediaPaths: ["/tmp/voice.ogg"],
+            MediaTypes: ["audio/ogg"],
+          },
+        }),
+      ],
+      noun: "message",
+    });
+
+    expect(summaryPrompt).toContain("[Queue overflow] Dropped 1 message due to cap.");
+    expect(summaryPrompt).toContain("overflow transcript");
+    expect(summaryPrompt).not.toContain("[media attached: /tmp/voice.ogg");
+  });
+});
diff --git a/src/auto-reply/reply/followup-runner.ts b/src/auto-reply/reply/followup-runner.ts
index fe90d56433c..50925cd1269 100644
--- a/src/auto-reply/reply/followup-runner.ts
+++ b/src/auto-reply/reply/followup-runner.ts
@@ -16,6 +16,7 @@ import type { OriginatingChannelType } from "../templating.js";
 import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../tokens.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveRunAuthProfile } from "./agent-runner-utils.js";
+import { applyDeferredMediaUnderstandingToQueuedRun } from "./followup-media.js";
 import {
   resolveOriginAccountId,
   resolveOriginMessageProvider,
@@ -154,6 +155,8 @@ export function createFollowupRunner(params: {
       let bootstrapPromptWarningSignaturesSeen = resolveBootstrapWarningSignaturesSeen(
         activeSessionEntry?.systemPromptReport,
       );
+      await applyDeferredMediaUnderstandingToQueuedRun(queued, { logLabel: "followup" });
+
       try {
         const fallbackResult = await runWithModelFallback({
           cfg: queued.run.config,
diff --git a/src/auto-reply/reply/get-reply-run.media-only.test.ts b/src/auto-reply/reply/get-reply-run.media-only.test.ts
index 829b3937009..f519da10082 100644
--- a/src/auto-reply/reply/get-reply-run.media-only.test.ts
+++ b/src/auto-reply/reply/get-reply-run.media-only.test.ts
@@ -172,6 +172,45 @@ describe("runPreparedReply media-only handling", () => {
     expect(call?.followupRun.prompt).toContain("[User sent media without caption]");
   });
 
+  it("snapshots URL-only attachments into followup mediaContext", async () => {
+    await runPreparedReply(
+      baseParams({
+        ctx: {
+          Body: "check this attachment",
+          RawBody: "check this attachment",
+          CommandBody: "check this attachment",
+          ThreadHistoryBody: "Earlier message in this thread",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+          ChatType: "group",
+          MediaUrl: "https://cdn.example.com/input.png",
+          MediaUrls: ["https://cdn.example.com/input.png"],
+          MediaType: "image/png",
+          MediaTypes: ["image/png"],
+        },
+        sessionCtx: {
+          Body: "check this attachment",
+          BodyStripped: "check this attachment",
+          ThreadHistoryBody: "Earlier message in this thread",
+          Provider: "slack",
+          ChatType: "group",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+        },
+      }),
+    );
+
+    const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
+    expect(call?.followupRun.mediaContext).toEqual(
+      expect.objectContaining({
+        MediaUrl: "https://cdn.example.com/input.png",
+        MediaUrls: ["https://cdn.example.com/input.png"],
+        MediaType: "image/png",
+        MediaTypes: ["image/png"],
+      }),
+    );
+  });
+
   it("keeps thread history context on follow-up turns", async () => {
     const result = await runPreparedReply(
       baseParams({
@@ -186,6 +225,41 @@ describe("runPreparedReply media-only handling", () => {
     expect(call?.followupRun.prompt).toContain("Earlier message in this thread");
   });
 
+  it("snapshots mediaContext for URL-only deferred attachments", async () => {
+    await runPreparedReply(
+      baseParams({
+        ctx: {
+          Body: "",
+          RawBody: "",
+          CommandBody: "",
+          MediaUrl: "https://cdn.example.com/audio.ogg",
+          MediaUrls: ["https://cdn.example.com/audio.ogg"],
+          MediaType: "audio/ogg",
+          MediaTypes: ["audio/ogg"],
+          ThreadHistoryBody: "Earlier message in this thread",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+          ChatType: "group",
+        },
+        sessionCtx: {
+          Body: "",
+          BodyStripped: "",
+          ThreadHistoryBody: "Earlier message in this thread",
+          Provider: "slack",
+          ChatType: "group",
+          OriginatingChannel: "slack",
+          OriginatingTo: "C123",
+        },
+      }),
+    );
+
+    const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0];
+    expect(call?.followupRun.mediaContext?.MediaUrl).toBe("https://cdn.example.com/audio.ogg");
+    expect(call?.followupRun.mediaContext?.MediaUrls).toEqual([
+      "https://cdn.example.com/audio.ogg",
+    ]);
+  });
+
   it("returns the empty-body reply when there is no text and no media", async () => {
     const result = await runPreparedReply(
       baseParams({
diff --git a/src/auto-reply/reply/get-reply-run.ts b/src/auto-reply/reply/get-reply-run.ts
index 760c42aed1a..92ca1be7513 100644
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -307,7 +307,14 @@ export async function runPreparedReply(
     : [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n");
   const baseBodyTrimmed = baseBodyForPrompt.trim();
   const hasMediaAttachment = Boolean(
-    sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0),
+    sessionCtx.MediaPath ||
+    sessionCtx.MediaUrl ||
+    (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0) ||
+    (sessionCtx.MediaUrls && sessionCtx.MediaUrls.length > 0) ||
+    ctx.MediaPath?.trim() ||
+    ctx.MediaUrl?.trim() ||
+    (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
+    (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
   );
   if (!baseBodyTrimmed && !hasMediaAttachment) {
     await typing.onReplyStart();
@@ -384,7 +391,7 @@ export async function runPreparedReply(
   const mediaReplyHint = mediaNote
     ? "To send an image back, prefer the message tool (media/path/filePath). If you must inline, use MEDIA:https://example.com/image.jpg (spaces ok, quote if needed) or a safe relative path like MEDIA:./image.jpg. Avoid absolute paths (MEDIA:/...) and ~ paths — they are blocked for security. Keep caption in the text body."
     : undefined;
-  let prefixedCommandBody = mediaNote
+  const prefixedCommandBody = mediaNote
     ? [mediaNote, mediaReplyHint, prefixedBody ?? ""].filter(Boolean).join("\n").trim()
     : prefixedBody;
   if (!resolvedThinkLevel) {
@@ -469,11 +476,48 @@ export async function runPreparedReply(
     isNewSession,
   });
   const authProfileIdSource = sessionEntry?.authProfileOverrideSource;
+  // Snapshot media-related context for deferred media understanding in the
+  // followup runner.  When MediaUnderstanding is already populated the runner
+  // knows transcription already succeeded and skips re-application.
+  const hasMediaAttachments = Boolean(
+    ctx.MediaPath?.trim() ||
+    ctx.MediaUrl?.trim() ||
+    (Array.isArray(ctx.MediaPaths) && ctx.MediaPaths.length > 0) ||
+    (Array.isArray(ctx.MediaUrls) && ctx.MediaUrls.length > 0),
+  );
+  const mediaContext = hasMediaAttachments
+    ? {
+        Body: ctx.Body,
+        CommandBody: ctx.CommandBody,
+        RawBody: ctx.RawBody,
+        Provider: ctx.Provider ?? sessionCtx.Provider,
+        Surface: ctx.Surface ?? sessionCtx.Surface,
+        MediaPath: ctx.MediaPath,
+        MediaUrl: ctx.MediaUrl,
+        MediaType: ctx.MediaType,
+        MediaDir: ctx.MediaDir,
+        MediaPaths: ctx.MediaPaths ? [...ctx.MediaPaths] : undefined,
+        MediaUrls: ctx.MediaUrls ? [...ctx.MediaUrls] : undefined,
+        MediaTypes: ctx.MediaTypes ? [...ctx.MediaTypes] : undefined,
+        MediaRemoteHost: ctx.MediaRemoteHost,
+        Transcript: ctx.Transcript,
+        MediaUnderstanding: ctx.MediaUnderstanding ? [...ctx.MediaUnderstanding] : undefined,
+        MediaUnderstandingDecisions: ctx.MediaUnderstandingDecisions
+          ? [...ctx.MediaUnderstandingDecisions]
+          : undefined,
+        OriginatingChannel: ctx.OriginatingChannel,
+        OriginatingTo: ctx.OriginatingTo,
+        AccountId: ctx.AccountId,
+        MessageThreadId: ctx.MessageThreadId,
+      }
+    : undefined;
+
   const followupRun = {
     prompt: queuedBody,
     messageId: sessionCtx.MessageSidFull ?? sessionCtx.MessageSid,
     summaryLine: baseBodyTrimmedRaw,
     enqueuedAt: Date.now(),
+    mediaContext,
     // Originating channel for reply routing.
     originatingChannel: ctx.OriginatingChannel,
     originatingTo: ctx.OriginatingTo,
diff --git a/src/auto-reply/reply/queue.ts b/src/auto-reply/reply/queue.ts
index b097b6c5193..5c3a56f64af 100644
--- a/src/auto-reply/reply/queue.ts
+++ b/src/auto-reply/reply/queue.ts
@@ -1,6 +1,6 @@
-export { extractQueueDirective } from "./queue/directive.js";
-export { clearSessionQueues } from "./queue/cleanup.js";
 export type { ClearSessionQueueResult } from "./queue/cleanup.js";
+export { clearSessionQueues } from "./queue/cleanup.js";
+export { extractQueueDirective } from "./queue/directive.js";
 export { scheduleFollowupDrain } from "./queue/drain.js";
 export {
   enqueueFollowupRun,
@@ -10,6 +10,7 @@ export {
 export { resolveQueueSettings } from "./queue/settings.js";
 export { clearFollowupQueue } from "./queue/state.js";
 export type {
+  FollowupMediaContext,
   FollowupRun,
   QueueDedupeMode,
   QueueDropPolicy,
diff --git a/src/auto-reply/reply/queue/drain.ts b/src/auto-reply/reply/queue/drain.ts
index 1e2fb33e4e0..41c859d2ec3 100644
--- a/src/auto-reply/reply/queue/drain.ts
+++ b/src/auto-reply/reply/queue/drain.ts
@@ -3,15 +3,17 @@ import { resolveGlobalMap } from "../../../shared/global-singleton.js";
 import {
   buildCollectPrompt,
   beginQueueDrain,
+  buildQueueSummaryLine,
+  buildQueueSummaryPrompt,
   clearQueueSummaryState,
   drainCollectQueueStep,
   drainNextQueueItem,
   hasCrossChannelItems,
-  previewQueueSummaryPrompt,
   waitForQueueDebounce,
 } from "../../../utils/queue-helpers.js";
+import { applyDeferredMediaUnderstandingToQueuedRun } from "../followup-media.js";
 import { isRoutableChannel } from "../route-reply.js";
-import { FOLLOWUP_QUEUES } from "./state.js";
+import { FOLLOWUP_QUEUES, type FollowupQueueState } from "./state.js";
 import type { FollowupRun } from "./types.js";
 
 // Persists the most recent runFollowup callback per queue key so that
@@ -68,6 +70,60 @@ function resolveCrossChannelKey(item: FollowupRun): { cross?: true; key?: string
   };
 }
 
+function clearFollowupQueueSummaryState(queue: FollowupQueueState): void {
+  clearQueueSummaryState(queue);
+  queue.summaryItems = [];
+}
+
+export async function applyDeferredMediaToQueuedRuns(items: FollowupRun[]): Promise<void> {
+  await Promise.allSettled(
+    items.map(
+      async (item) =>
+        await applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }),
+    ),
+  );
+}
+
+async function resolveSummaryLines(items: FollowupRun[]): Promise<string[]> {
+  // Parallelize the media understanding API calls upfront (same pattern as
+  // applyDeferredMediaToQueuedRuns), then build summary lines sequentially
+  // so line order matches the original item order.
+  await Promise.allSettled(
+    items.map((item) =>
+      applyDeferredMediaUnderstandingToQueuedRun(item, { logLabel: "followup queue" }),
+    ),
+  );
+  // After deferred media, prefer the updated prompt (which includes transcripts)
+  // over the original summaryLine (which may just be the caption text).
+  return items.map((item) =>
+    buildQueueSummaryLine(item.prompt.trim() || item.summaryLine?.trim() || ""),
+  );
+}
+
+export async function buildMediaAwareQueueSummaryPrompt(params: {
+  dropPolicy: FollowupQueueState["dropPolicy"];
+  droppedCount: number;
+  summaryLines: string[];
+  summaryItems: FollowupRun[];
+  noun: string;
+}): Promise<string | undefined> {
+  if (params.dropPolicy !== "summarize" || params.droppedCount <= 0) {
+    return undefined;
+  }
+  const summaryLines =
+    params.summaryItems.length > 0
+      ? await resolveSummaryLines(params.summaryItems)
+      : params.summaryLines;
+  return buildQueueSummaryPrompt({
+    state: {
+      dropPolicy: params.dropPolicy,
+      droppedCount: params.droppedCount,
+      summaryLines: [...summaryLines],
+    },
+    noun: params.noun,
+  });
+}
+
 export function scheduleFollowupDrain(
   key: string,
   runFollowup: (run: FollowupRun) => Promise<void>,
@@ -107,7 +163,14 @@ export function scheduleFollowupDrain(
           }
 
           const items = queue.items.slice();
-          const summary = previewQueueSummaryPrompt({ state: queue, noun: "message" });
+          await applyDeferredMediaToQueuedRuns(items);
+          const summary = await buildMediaAwareQueueSummaryPrompt({
+            dropPolicy: queue.dropPolicy,
+            droppedCount: queue.droppedCount,
+            summaryLines: queue.summaryLines,
+            summaryItems: queue.summaryItems,
+            noun: "message",
+          });
           const run = items.at(-1)?.run ?? queue.lastRun;
           if (!run) {
             break;
@@ -129,12 +192,18 @@ export function scheduleFollowupDrain(
           });
           queue.items.splice(0, items.length);
           if (summary) {
-            clearQueueSummaryState(queue);
+            clearFollowupQueueSummaryState(queue);
           }
           continue;
         }
 
-        const summaryPrompt = previewQueueSummaryPrompt({ state: queue, noun: "message" });
+        const summaryPrompt = await buildMediaAwareQueueSummaryPrompt({
+          dropPolicy: queue.dropPolicy,
+          droppedCount: queue.droppedCount,
+          summaryLines: queue.summaryLines,
+          summaryItems: queue.summaryItems,
+          noun: "message",
+        });
         if (summaryPrompt) {
           const run = queue.lastRun;
           if (!run) {
@@ -155,7 +224,7 @@ export function scheduleFollowupDrain(
           ) {
             break;
           }
-          clearQueueSummaryState(queue);
+          clearFollowupQueueSummaryState(queue);
           continue;
         }
 
diff --git a/src/auto-reply/reply/queue/enqueue.ts b/src/auto-reply/reply/queue/enqueue.ts
index 11da0db98fc..e58cc5ffac5 100644
--- a/src/auto-reply/reply/queue/enqueue.ts
+++ b/src/auto-reply/reply/queue/enqueue.ts
@@ -1,8 +1,8 @@
 import { createDedupeCache } from "../../../infra/dedupe.js";
 import { resolveGlobalSingleton } from "../../../shared/global-singleton.js";
-import { applyQueueDropPolicy, shouldSkipQueueItem } from "../../../utils/queue-helpers.js";
+import { buildQueueSummaryLine, shouldSkipQueueItem } from "../../../utils/queue-helpers.js";
 import { kickFollowupDrainIfIdle } from "./drain.js";
-import { getExistingFollowupQueue, getFollowupQueue } from "./state.js";
+import { getExistingFollowupQueue, getFollowupQueue, type FollowupQueueState } from "./state.js";
 import type { FollowupRun, QueueDedupeMode, QueueSettings } from "./types.js";
 
 /**
@@ -57,6 +57,34 @@ function isRunAlreadyQueued(
   return items.some((item) => item.prompt === run.prompt && hasSameRouting(item));
 }
 
+function applyFollowupQueueDropPolicy(queue: FollowupQueueState): boolean {
+  const cap = queue.cap;
+  if (cap <= 0 || queue.items.length < cap) {
+    return true;
+  }
+  if (queue.dropPolicy === "new") {
+    return false;
+  }
+
+  const dropCount = queue.items.length - cap + 1;
+  const dropped = queue.items.splice(0, dropCount);
+  if (queue.dropPolicy === "summarize") {
+    for (const item of dropped) {
+      queue.droppedCount += 1;
+      queue.summaryItems.push(item);
+      queue.summaryLines.push(
+        buildQueueSummaryLine(item.summaryLine?.trim() || item.prompt.trim()),
+      );
+    }
+    const limit = Math.max(0, cap);
+    while (queue.summaryLines.length > limit) {
+      queue.summaryLines.shift();
+      queue.summaryItems.shift();
+    }
+  }
+  return true;
+}
+
 export function enqueueFollowupRun(
   key: string,
   run: FollowupRun,
@@ -83,10 +111,7 @@ export function enqueueFollowupRun(
   queue.lastEnqueuedAt = Date.now();
   queue.lastRun = run.run;
 
-  const shouldEnqueue = applyQueueDropPolicy({
-    queue,
-    summarize: (item) => item.summaryLine?.trim() || item.prompt.trim(),
-  });
+  const shouldEnqueue = applyFollowupQueueDropPolicy(queue);
   if (!shouldEnqueue) {
     return false;
   }
diff --git a/src/auto-reply/reply/queue/state.ts b/src/auto-reply/reply/queue/state.ts
index 44208e727dd..94021dd0c4c 100644
--- a/src/auto-reply/reply/queue/state.ts
+++ b/src/auto-reply/reply/queue/state.ts
@@ -4,6 +4,7 @@ import type { FollowupRun, QueueDropPolicy, QueueMode, QueueSettings } from "./t
 
 export type FollowupQueueState = {
   items: FollowupRun[];
+  summaryItems: FollowupRun[];
   draining: boolean;
   lastEnqueuedAt: number;
   mode: QueueMode;
@@ -47,6 +48,7 @@ export function getFollowupQueue(key: string, settings: QueueSettings): Followup
 
   const created: FollowupQueueState = {
     items: [],
+    summaryItems: [],
     draining: false,
     lastEnqueuedAt: 0,
     mode: settings.mode,
@@ -78,6 +80,7 @@ export function clearFollowupQueue(key: string): number {
   }
   const cleared = queue.items.length + queue.droppedCount;
   queue.items.length = 0;
+  queue.summaryItems.length = 0;
   queue.droppedCount = 0;
   queue.summaryLines = [];
   queue.lastRun = undefined;
diff --git a/src/auto-reply/reply/queue/types.ts b/src/auto-reply/reply/queue/types.ts
index 507f77d499d..e1e9e20e5c8 100644
--- a/src/auto-reply/reply/queue/types.ts
+++ b/src/auto-reply/reply/queue/types.ts
@@ -2,6 +2,10 @@ import type { ExecToolDefaults } from "../../../agents/bash-tools.js";
 import type { SkillSnapshot } from "../../../agents/skills.js";
 import type { OpenClawConfig } from "../../../config/config.js";
 import type { SessionEntry } from "../../../config/sessions.js";
+import type {
+  MediaUnderstandingDecision,
+  MediaUnderstandingOutput,
+} from "../../../media-understanding/types.js";
 import type { InputProvenance } from "../../../sessions/input-provenance.js";
 import type { OriginatingChannelType } from "../../templating.js";
 import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "../directives.js";
@@ -19,12 +23,55 @@ export type QueueSettings = {
 
 export type QueueDedupeMode = "message-id" | "prompt" | "none";
 
+/**
+ * Snapshot of media-related context fields carried on a FollowupRun so that
+ * the followup runner can apply media understanding (e.g. voice-note
+ * transcription) when it was not applied — or failed — in the primary path.
+ */
+export type FollowupMediaContext = {
+  Body?: string;
+  CommandBody?: string;
+  RawBody?: string;
+  Provider?: string;
+  Surface?: string;
+  MediaPath?: string;
+  MediaUrl?: string;
+  MediaType?: string;
+  MediaDir?: string;
+  MediaPaths?: string[];
+  MediaUrls?: string[];
+  MediaTypes?: string[];
+  MediaRemoteHost?: string;
+  Transcript?: string;
+  MediaUnderstanding?: MediaUnderstandingOutput[];
+  MediaUnderstandingDecisions?: MediaUnderstandingDecision[];
+  OriginatingChannel?: OriginatingChannelType;
+  OriginatingTo?: string;
+  AccountId?: string;
+  MessageThreadId?: string | number;
+  DeferredMediaApplied?: boolean;
+  /**
+   * Set when file extraction has already been applied to Body (either in the
+   * primary path or by a previous deferred-media run).  Checked instead of
+   * scanning body text for `<file` patterns to avoid false-positives on user
+   * messages that contain literal XML-like text.
+   */
+  DeferredFileBlocksExtracted?: boolean;
+};
+
 export type FollowupRun = {
   prompt: string;
   /** Provider message ID, when available (for deduplication). */
   messageId?: string;
   summaryLine?: string;
   enqueuedAt: number;
+  /**
+   * Media context snapshot from the original inbound message.
+   * When present and MediaUnderstanding is empty, the followup runner will
+   * attempt to apply media understanding (audio transcription, etc.) before
+   * passing the prompt to the agent.
+   */
+  mediaContext?: FollowupMediaContext;
   /**
    * Originating channel for reply routing.
    * When set, replies should be routed back to this provider