fix(slack): reconnect socket mode after disconnect (#27232)

* fix(slack): reconnect socket mode after disconnect

* fix(slack): avoid orphaned disconnect waiters on start failure

* docs(changelog): record slack socket reconnect reliability fix

---------

Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
Miguel Miranda Dias 2026-03-01 17:42:45 +01:00 committed by GitHub
parent a54b85822c
commit 949faff5ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 208 additions and 10 deletions

View File

@ -98,6 +98,7 @@ Docs: https://docs.openclaw.ai
- Slack/Usage footer formatting: wrap session keys in inline code in full response-usage footers so Slack does not parse colon-delimited session segments as emoji shortcodes. (#30258) Thanks @pushkarsingh32.
- Slack/Socket Mode slash startup: treat `app.options()` registration as best-effort and fall back to static arg menus when listener registration fails, preventing Slack monitor startup crash loops on receiver init edge cases. (#21715)
- Slack/Legacy streaming config: map boolean `channels.slack.streaming=false` to unified streaming mode `off` (with `nativeStreaming=false`) so legacy configs correctly disable draft preview/native streaming instead of defaulting to `partial`. (#25990) Thanks @chilu18.
- Slack/Socket reconnect reliability: reconnect Socket Mode after disconnect/start failures using bounded exponential backoff with abort-aware waits, while preserving clean shutdown behavior and adding disconnect/error helper tests. (#27232) Thanks @pandego.
- Onboarding/Custom providers: raise default custom-provider model context window to the runtime hard minimum (16k) and auto-heal existing custom model entries below that threshold during reconfiguration, preventing immediate `Model context window too small (4096 tokens)` failures. (#21653) Thanks @r4jiv007.
- Web UI/Assistant text: strip internal `<relevant-memories>...</relevant-memories>` scaffolding from rendered assistant messages (while preserving code-fence literals), preventing memory-context leakage in chat output for models that echo internal blocks. (#29851) Thanks @Valkster70.
- Dashboard/Sessions: allow authenticated Control UI clients to delete and patch sessions while still blocking regular webchat clients from session mutation RPCs, fixing Dashboard session delete failures. (#21264) Thanks @jskoiz.

View File

@ -0,0 +1,45 @@
import { describe, expect, it } from "vitest";
import { __testing } from "./provider.js";
class FakeEmitter {
private listeners = new Map<string, Set<(...args: unknown[]) => void>>();
on(event: string, listener: (...args: unknown[]) => void) {
const bucket = this.listeners.get(event) ?? new Set<(...args: unknown[]) => void>();
bucket.add(listener);
this.listeners.set(event, bucket);
}
off(event: string, listener: (...args: unknown[]) => void) {
this.listeners.get(event)?.delete(listener);
}
emit(event: string, ...args: unknown[]) {
for (const listener of this.listeners.get(event) ?? []) {
listener(...args);
}
}
}
describe("slack socket reconnect helpers", () => {
it("resolves disconnect waiter on socket disconnect event", async () => {
const client = new FakeEmitter();
const app = { receiver: { client } };
const waiter = __testing.waitForSlackSocketDisconnect(app as never);
client.emit("disconnected");
await expect(waiter).resolves.toEqual({ event: "disconnect" });
});
it("resolves disconnect waiter on socket error event", async () => {
const client = new FakeEmitter();
const app = { receiver: { client } };
const err = new Error("dns down");
const waiter = __testing.waitForSlackSocketDisconnect(app as never);
client.emit("error", err);
await expect(waiter).resolves.toEqual({ event: "error", error: err });
});
});

View File

@ -18,6 +18,7 @@ import {
} from "../../config/runtime-group-policy.js";
import type { SessionScope } from "../../config/sessions.js";
import { warn } from "../../globals.js";
import { computeBackoff, sleepWithAbort } from "../../infra/backoff.js";
import { installRequestBodyLimitGuard } from "../../infra/http-body.js";
import { normalizeMainKey } from "../../routing/session-key.js";
import { createNonExitingRuntime, type RuntimeEnv } from "../../runtime.js";
@ -46,6 +47,100 @@ const { App, HTTPReceiver } = slackBolt;
const SLACK_WEBHOOK_MAX_BODY_BYTES = 1024 * 1024;
const SLACK_WEBHOOK_BODY_TIMEOUT_MS = 30_000;
const SLACK_SOCKET_RECONNECT_POLICY = {
initialMs: 2_000,
maxMs: 30_000,
factor: 1.8,
jitter: 0.25,
maxAttempts: 12,
} as const;
type SlackSocketDisconnectEvent = "disconnect" | "unable_to_socket_mode_start" | "error";
type EmitterLike = {
on: (event: string, listener: (...args: unknown[]) => void) => unknown;
off: (event: string, listener: (...args: unknown[]) => void) => unknown;
};
function getSocketEmitter(app: unknown): EmitterLike | null {
const receiver = (app as { receiver?: unknown }).receiver;
const client =
receiver && typeof receiver === "object"
? (receiver as { client?: unknown }).client
: undefined;
if (!client || typeof client !== "object") {
return null;
}
const on = (client as { on?: unknown }).on;
const off = (client as { off?: unknown }).off;
if (typeof on !== "function" || typeof off !== "function") {
return null;
}
return {
on: (event, listener) =>
(
on as (this: unknown, event: string, listener: (...args: unknown[]) => void) => unknown
).call(client, event, listener),
off: (event, listener) =>
(
off as (this: unknown, event: string, listener: (...args: unknown[]) => void) => unknown
).call(client, event, listener),
};
}
function waitForSlackSocketDisconnect(
app: unknown,
abortSignal?: AbortSignal,
): Promise<{
event: SlackSocketDisconnectEvent;
error?: unknown;
}> {
return new Promise((resolve) => {
const emitter = getSocketEmitter(app);
if (!emitter) {
abortSignal?.addEventListener("abort", () => resolve({ event: "disconnect" }), {
once: true,
});
return;
}
const disconnectListener = () => resolveOnce({ event: "disconnect" });
const startFailListener = () => resolveOnce({ event: "unable_to_socket_mode_start" });
const errorListener = (error: unknown) => resolveOnce({ event: "error", error });
const abortListener = () => resolveOnce({ event: "disconnect" });
const cleanup = () => {
emitter.off("disconnected", disconnectListener);
emitter.off("unable_to_socket_mode_start", startFailListener);
emitter.off("error", errorListener);
abortSignal?.removeEventListener("abort", abortListener);
};
const resolveOnce = (value: { event: SlackSocketDisconnectEvent; error?: unknown }) => {
cleanup();
resolve(value);
};
emitter.on("disconnected", disconnectListener);
emitter.on("unable_to_socket_mode_start", startFailListener);
emitter.on("error", errorListener);
abortSignal?.addEventListener("abort", abortListener, { once: true });
});
}
function formatUnknownError(error: unknown): string {
if (error instanceof Error) {
return error.message;
}
if (typeof error === "string") {
return error;
}
try {
return JSON.stringify(error);
} catch {
return "unknown error";
}
}
function parseApiAppIdFromAppToken(raw?: string) {
const token = raw?.trim();
@ -362,19 +457,74 @@ export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) {
try {
if (slackMode === "socket") {
await app.start();
runtime.log?.("slack socket mode connected");
let reconnectAttempts = 0;
while (!opts.abortSignal?.aborted) {
try {
await app.start();
reconnectAttempts = 0;
runtime.log?.("slack socket mode connected");
} catch (err) {
reconnectAttempts += 1;
if (
SLACK_SOCKET_RECONNECT_POLICY.maxAttempts > 0 &&
reconnectAttempts >= SLACK_SOCKET_RECONNECT_POLICY.maxAttempts
) {
throw err;
}
const delayMs = computeBackoff(SLACK_SOCKET_RECONNECT_POLICY, reconnectAttempts);
runtime.error?.(
`slack socket mode failed to start. retry ${reconnectAttempts}/${SLACK_SOCKET_RECONNECT_POLICY.maxAttempts || "∞"} in ${Math.round(delayMs / 1000)}s (${formatUnknownError(err)})`,
);
try {
await sleepWithAbort(delayMs, opts.abortSignal);
} catch {
break;
}
continue;
}
if (opts.abortSignal?.aborted) {
break;
}
const disconnect = await waitForSlackSocketDisconnect(app, opts.abortSignal);
if (opts.abortSignal?.aborted) {
break;
}
reconnectAttempts += 1;
if (
SLACK_SOCKET_RECONNECT_POLICY.maxAttempts > 0 &&
reconnectAttempts >= SLACK_SOCKET_RECONNECT_POLICY.maxAttempts
) {
throw new Error(
`Slack socket mode reconnect max attempts reached (${reconnectAttempts}/${SLACK_SOCKET_RECONNECT_POLICY.maxAttempts}) after ${disconnect.event}`,
);
}
const delayMs = computeBackoff(SLACK_SOCKET_RECONNECT_POLICY, reconnectAttempts);
runtime.error?.(
`slack socket disconnected (${disconnect.event}). retry ${reconnectAttempts}/${SLACK_SOCKET_RECONNECT_POLICY.maxAttempts || "∞"} in ${Math.round(delayMs / 1000)}s${
disconnect.error ? ` (${formatUnknownError(disconnect.error)})` : ""
}`,
);
await app.stop().catch(() => undefined);
try {
await sleepWithAbort(delayMs, opts.abortSignal);
} catch {
break;
}
}
} else {
runtime.log?.(`slack http mode listening at ${slackWebhookPath}`);
if (!opts.abortSignal?.aborted) {
await new Promise<void>((resolve) => {
opts.abortSignal?.addEventListener("abort", () => resolve(), {
once: true,
});
});
}
}
if (opts.abortSignal?.aborted) {
return;
}
await new Promise<void>((resolve) => {
opts.abortSignal?.addEventListener("abort", () => resolve(), {
once: true,
});
});
} finally {
opts.abortSignal?.removeEventListener("abort", stopOnAbort);
unregisterHttpHandler?.();
@ -385,4 +535,6 @@ export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) {
export const __testing = {
resolveSlackRuntimeGroupPolicy: resolveOpenProviderRuntimeGroupPolicy,
resolveDefaultGroupPolicy,
getSocketEmitter,
waitForSlackSocketDisconnect,
};