From 2f86ae71d574f3f17b7001ea6ef6ae78b8b872ff Mon Sep 17 00:00:00 2001
From: Vignesh Natarajan <vigneshnatarajan92@gmail.com>
Date: Thu, 5 Mar 2026 19:03:56 -0800
Subject: [PATCH] fix(subagents): recover announce cleanup after kill/complete
 race

---
 CHANGELOG.md                                  |  1 +
 .../subagent-registry.steer-restart.test.ts   | 32 +++++++++++++++++++
 src/agents/subagent-registry.ts               | 13 ++++++++
 3 files changed, 46 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a726bb1b25..f53c2aa5a08 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -37,6 +37,7 @@ Docs: https://docs.openclaw.ai
 - Onboarding/API key input hardening: strip non-Latin1 Unicode artifacts from normalized secret input (while preserving Latin-1 content and internal spaces) so malformed copied API keys cannot trigger HTTP header `ByteString` construction crashes; adds regression coverage for shared normalization and MiniMax auth header usage. (#24496) Thanks @fa6maalassaf.
 - Kimi Coding/Anthropic tools compatibility: normalize `anthropic-messages` tool payloads to OpenAI-style `tools[].function` + compatible `tool_choice` when targeting Kimi Coding endpoints, restoring tool-call workflows that regressed after v2026.3.2. (#37038) Thanks @mochimochimochi-hub.
 - Heartbeat/workspace-path guardrails: append explicit workspace `HEARTBEAT.md` path guidance (and `docs/heartbeat.md` avoidance) to heartbeat prompts so heartbeat runs target workspace checklists reliably across packaged install layouts. (#37037) Thanks @stofancy.
+- Subagents/kill-complete announce race: when a late `subagent-complete` lifecycle event arrives after an earlier kill marker, clear stale kill suppression/cleanup flags and re-run announce cleanup so finished runs no longer get silently swallowed. (#37024) Thanks @cmfinlan.
 - Gateway/remote WS break-glass hostname support: honor `OPENCLAW_ALLOW_INSECURE_PRIVATE_WS=1` for `ws://` hostname URLs (not only private IP literals) across onboarding validation and runtime gateway connection checks, while still rejecting public IP literals and non-unicast IPv6 endpoints. (#36930) Thanks @manju-rn.
 - Routing/binding lookup scalability: pre-index route bindings by channel/account and avoid full binding-list rescans on channel-account cache rollover, preventing multi-second `resolveAgentRoute` stalls in large binding configurations. (#36915) Thanks @songchenghao.
 - Browser/session cleanup: track browser tabs opened by session-scoped browser tool runs and close tracked tabs during `sessions.reset`/`sessions.delete` runtime cleanup, preventing orphaned tabs and unbounded browser memory growth after session teardown. (#36666) Thanks @Harnoor6693.
diff --git a/src/agents/subagent-registry.steer-restart.test.ts b/src/agents/subagent-registry.steer-restart.test.ts
index 9ad20be4719..19ef8228688 100644
--- a/src/agents/subagent-registry.steer-restart.test.ts
+++ b/src/agents/subagent-registry.steer-restart.test.ts
@@ -447,6 +447,38 @@ describe("subagent registry steer restarts", () => {
     );
   });
 
+  it("recovers announce cleanup when completion arrives after a kill marker", async () => {
+    const childSessionKey = "agent:main:subagent:kill-race";
+    registerRun({
+      runId: "run-kill-race",
+      childSessionKey,
+      task: "race test",
+    });
+
+    expect(mod.markSubagentRunTerminated({ runId: "run-kill-race", reason: "manual kill" })).toBe(
+      1, // NOTE(review): presumably the number of runs marked terminated — confirm
+    );
+    expect(listMainRuns()[0]?.suppressAnnounceReason).toBe("killed"); // announce suppressed
+    expect(listMainRuns()[0]?.cleanupHandled).toBe(true); // cleanup already marked handled
+    expect(typeof listMainRuns()[0]?.cleanupCompletedAt).toBe("number");
+
+    emitLifecycleEnd("run-kill-race"); // late completion arriving after the kill marker
+    await flushAnnounce();
+    await flushAnnounce(); // NOTE(review): second flush presumably lets re-run cleanup settle
+
+    expect(announceSpy).toHaveBeenCalledTimes(1); // announce is no longer swallowed
+    const announce = (announceSpy.mock.calls[0]?.[0] ?? {}) as { childRunId?: string };
+    expect(announce.childRunId).toBe("run-kill-race");
+
+    const run = listMainRuns()[0];
+    expect(run?.endedReason).toBe("subagent-complete"); // run ends as a completion
+    expect(run?.outcome?.status).not.toBe("error");
+    expect(run?.suppressAnnounceReason).toBeUndefined(); // stale kill suppression cleared
+    expect(run?.cleanupHandled).toBe(true); // set again once the re-run cleanup finished
+    expect(typeof run?.cleanupCompletedAt).toBe("number");
+    expect(runSubagentEndedHookMock).toHaveBeenCalledTimes(1);
+  });
+
   it("retries deferred parent cleanup after a descendant announces", async () => {
     let parentAttempts = 0;
     announceSpy.mockImplementation(async (params: unknown) => {
diff --git a/src/agents/subagent-registry.ts b/src/agents/subagent-registry.ts
index 900aa4752d9..f2d9b81cd5a 100644
--- a/src/agents/subagent-registry.ts
+++ b/src/agents/subagent-registry.ts
@@ -338,6 +338,19 @@ async function completeSubagentRun(params: {
   }
 
   let mutated = false;
+  // If a late lifecycle completion arrives after an earlier kill marker, allow
+  // completion cleanup/announce to run instead of staying permanently suppressed.
+  if (
+    params.reason === SUBAGENT_ENDED_REASON_COMPLETE &&
+    entry.suppressAnnounceReason === "killed" &&
+    (entry.cleanupHandled || typeof entry.cleanupCompletedAt === "number")
+  ) {
+    entry.suppressAnnounceReason = undefined;
+    entry.cleanupHandled = false;
+    entry.cleanupCompletedAt = undefined;
+    mutated = true;
+  }
+
   const endedAt = typeof params.endedAt === "number" ? params.endedAt : Date.now();
   if (entry.endedAt !== endedAt) {
     entry.endedAt = endedAt;