diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c2b98df354..3fde0f44159 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Gateway/Telegram stale-socket restart guard: only apply stale-socket restarts to channels that publish event-liveness timestamps, preventing Telegram providers from being misclassified as stale solely due to long uptime and avoiding restart/pairing storms after upgrade. (openclaw#38464) - Onboarding/headless Linux daemon probe hardening: treat `systemctl --user is-enabled` probe failures as non-fatal during daemon install flow so onboarding no longer crashes on SSH/headless VPS environments before showing install guidance. (#37297) Thanks @acarbajal-web. - Memory/QMD mcporter Windows spawn hardening: when `mcporter.cmd` launch fails with `spawn EINVAL`, retry via bare `mcporter` shell resolution so QMD recall can continue instead of falling back to builtin memory search. (#27402) Thanks @i0ivi0i. - Tools/web_search Brave language-code validation: align `search_lang` handling with Brave-supported codes (including `zh-hans`, `zh-hant`, `en-gb`, and `pt-br`), map common alias inputs (`zh`, `ja`) to valid Brave values, and reject unsupported codes before upstream requests to prevent 422 failures. (#37260) Thanks @heyanming. diff --git a/src/discord/monitor/provider.lifecycle.ts b/src/discord/monitor/provider.lifecycle.ts index 6291d09a7b2..ffc78b40676 100644 --- a/src/discord/monitor/provider.lifecycle.ts +++ b/src/discord/monitor/provider.lifecycle.ts @@ -1,6 +1,7 @@ import type { Client } from "@buape/carbon"; import type { GatewayPlugin } from "@buape/carbon/gateway"; import { createArmableStallWatchdog } from "../../channels/transport/stall-watchdog.js"; +import { createConnectedChannelStatusPatch } from "../../gateway/channel-status-patches.js"; import { danger } from "../../globals.js"; import type { RuntimeEnv } from "../../runtime.js"; import { attachDiscordGatewayLogging } from "../gateway-logging.js"; @@ -180,8 +181,7 @@ export async function runDiscordGatewayLifecycle(params: { let sawConnected = gateway?.isConnected === true; if (sawConnected) { pushStatus({ - connected: true, - lastConnectedAt: at, + ...createConnectedChannelStatusPatch(at), lastDisconnect: null, }); } @@ -194,9 +194,7 @@ export async function runDiscordGatewayLifecycle(params: { const connectedAt = Date.now(); reconnectStallWatchdog.disarm(); pushStatus({ - connected: true, - lastEventAt: connectedAt, - lastConnectedAt: connectedAt, + ...createConnectedChannelStatusPatch(connectedAt), lastDisconnect: null, }); if (helloConnectedPollId) { @@ -253,9 +251,7 @@ export async function runDiscordGatewayLifecycle(params: { if (gateway?.isConnected && !lifecycleStopping) { const at = Date.now(); pushStatus({ - connected: true, - lastEventAt: at, - lastConnectedAt: at, + ...createConnectedChannelStatusPatch(at), lastDisconnect: null, }); } diff --git a/src/discord/monitor/provider.ts b/src/discord/monitor/provider.ts index fc24e6af1f5..c9f9f3d4b49 100644 --- a/src/discord/monitor/provider.ts +++ b/src/discord/monitor/provider.ts @@ -36,6 +36,7 @@ import { resolveDefaultGroupPolicy, warnMissingProviderGroupPolicyFallbackOnce, } from "../../config/runtime-group-policy.js"; +import { createConnectedChannelStatusPatch } from "../../gateway/channel-status-patches.js"; import { danger, logVerbose, shouldLogVerbose, warn } from "../../globals.js"; import { formatErrorMessage } from "../../infra/errors.js"; import { createDiscordRetryRunner } from "../../infra/retry-policy.js"; @@ -752,7 +753,7 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) { botUserId && botUserName ? `${botUserId} (${botUserName})` : (botUserId ?? botUserName ?? ""); runtime.log?.(`logged in to discord${botIdentity ? ` as ${botIdentity}` : ""}`); if (lifecycleGateway?.isConnected) { - opts.setStatus?.({ connected: true }); + opts.setStatus?.(createConnectedChannelStatusPatch()); } lifecycleStarted = true; diff --git a/src/gateway/channel-health-monitor.test.ts b/src/gateway/channel-health-monitor.test.ts index 3657dcb2c1e..6f7c8104874 100644 --- a/src/gateway/channel-health-monitor.test.ts +++ b/src/gateway/channel-health-monitor.test.ts @@ -489,16 +489,34 @@ describe("channel-health-monitor", () => { await expectNoRestart(manager); }); - it("restarts a channel that never received any event past the stale threshold", async () => { + it("restarts a channel that has seen no events since connect past the stale threshold", async () => { const now = Date.now(); const manager = createSlackSnapshotManager( runningConnectedSlackAccount({ lastStartAt: now - STALE_THRESHOLD - 60_000, + lastEventAt: now - STALE_THRESHOLD - 60_000, }), ); await expectRestartedChannel(manager, "slack"); }); + it("skips connected channels that do not report event liveness", async () => { + const now = Date.now(); + const manager = createSnapshotManager({ + telegram: { + default: { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: now - STALE_THRESHOLD - 60_000, + lastEventAt: null, + }, + }, + }); + await expectNoRestart(manager); + }); + it("respects custom staleEventThresholdMs", async () => { const customThreshold = 10 * 60_000; const now = Date.now(); diff --git a/src/gateway/channel-health-policy.test.ts b/src/gateway/channel-health-policy.test.ts index 125658ae53a..a4645a13e75 100644 --- a/src/gateway/channel-health-policy.test.ts +++ b/src/gateway/channel-health-policy.test.ts @@ -111,7 +111,7 @@ describe("evaluateChannelHealth", () => { enabled: true, configured: true, lastStartAt: 0, - lastEventAt: null, + lastEventAt: 0, }, { channelId: "discord", @@ -142,6 +142,65 @@ describe("evaluateChannelHealth", () => { ); expect(evaluation).toEqual({ healthy: true, reason: "healthy" }); }); + + it("does not flag stale sockets for channels without event tracking", () => { + const evaluation = evaluateChannelHealth( + { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: 0, + lastEventAt: null, + }, + { + channelId: "discord", + now: 100_000, + channelConnectGraceMs: 10_000, + staleEventThresholdMs: 30_000, + }, + ); + expect(evaluation).toEqual({ healthy: true, reason: "healthy" }); + }); + + it("does not flag stale sockets without an active connected socket", () => { + const evaluation = evaluateChannelHealth( + { + running: true, + enabled: true, + configured: true, + lastStartAt: 0, + lastEventAt: 0, + }, + { + channelId: "slack", + now: 100_000, + channelConnectGraceMs: 10_000, + staleEventThresholdMs: 30_000, + }, + ); + expect(evaluation).toEqual({ healthy: true, reason: "healthy" }); + }); + + it("ignores inherited event timestamps from a previous lifecycle", () => { + const evaluation = evaluateChannelHealth( + { + running: true, + connected: true, + enabled: true, + configured: true, + lastStartAt: 50_000, + lastEventAt: 10_000, + }, + { + channelId: "slack", + now: 100_000, + channelConnectGraceMs: 10_000, + staleEventThresholdMs: 30_000, + }, + ); + expect(evaluation).toEqual({ healthy: true, reason: "healthy" }); + }); }); describe("resolveChannelRestartReason", () => { diff --git a/src/gateway/channel-health-policy.ts b/src/gateway/channel-health-policy.ts index 80b0b3546ad..d8374d04ba8 100644 --- a/src/gateway/channel-health-policy.ts +++ b/src/gateway/channel-health-policy.ts @@ -103,17 +103,17 @@ export function evaluateChannelHealth( // Skip stale-socket check for Telegram (long-polling mode). Each polling request // acts as a heartbeat, so the half-dead WebSocket scenario this check is designed // to catch does not apply to Telegram's long-polling architecture. - if (policy.channelId !== "telegram") { - if (snapshot.lastEventAt != null || snapshot.lastStartAt != null) { - const upSince = snapshot.lastStartAt ?? 0; - const upDuration = policy.now - upSince; - if (upDuration > policy.staleEventThresholdMs) { - const lastEvent = snapshot.lastEventAt ?? 0; - const eventAge = policy.now - lastEvent; - if (eventAge > policy.staleEventThresholdMs) { - return { healthy: false, reason: "stale-socket" }; - } - } + if ( + policy.channelId !== "telegram" && + snapshot.connected === true && + snapshot.lastEventAt != null + ) { + if (lastStartAt != null && snapshot.lastEventAt < lastStartAt) { + return { healthy: true, reason: "healthy" }; + } + const eventAge = policy.now - snapshot.lastEventAt; + if (eventAge > policy.staleEventThresholdMs) { + return { healthy: false, reason: "stale-socket" }; } } return { healthy: true, reason: "healthy" }; diff --git a/src/gateway/channel-status-patches.test.ts b/src/gateway/channel-status-patches.test.ts new file mode 100644 index 00000000000..9297c23e69d --- /dev/null +++ b/src/gateway/channel-status-patches.test.ts @@ -0,0 +1,12 @@ +import { describe, expect, it } from "vitest"; +import { createConnectedChannelStatusPatch } from "./channel-status-patches.js"; + +describe("createConnectedChannelStatusPatch", () => { + it("uses one timestamp for connected event-liveness state", () => { + expect(createConnectedChannelStatusPatch(1234)).toEqual({ + connected: true, + lastConnectedAt: 1234, + lastEventAt: 1234, + }); + }); +}); diff --git a/src/gateway/channel-status-patches.ts b/src/gateway/channel-status-patches.ts new file mode 100644 index 00000000000..9e1af6a33d7 --- /dev/null +++ b/src/gateway/channel-status-patches.ts @@ -0,0 +1,15 @@ +export type ConnectedChannelStatusPatch = { + connected: true; + lastConnectedAt: number; + lastEventAt: number; +}; + +export function createConnectedChannelStatusPatch( + at: number = Date.now(), +): ConnectedChannelStatusPatch { + return { + connected: true, + lastConnectedAt: at, + lastEventAt: at, + }; +} diff --git a/src/slack/monitor/provider.reconnect.test.ts b/src/slack/monitor/provider.reconnect.test.ts index b3638a209bf..10fbab031a0 100644 --- a/src/slack/monitor/provider.reconnect.test.ts +++ b/src/slack/monitor/provider.reconnect.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; import { __testing } from "./provider.js"; class FakeEmitter { @@ -22,6 +22,22 @@ class FakeEmitter { } describe("slack socket reconnect helpers", () => { + it("seeds event liveness when socket mode connects", () => { + const setStatus = vi.fn(); + + __testing.publishSlackConnectedStatus(setStatus); + + expect(setStatus).toHaveBeenCalledTimes(1); + expect(setStatus).toHaveBeenCalledWith( + expect.objectContaining({ + connected: true, + lastConnectedAt: expect.any(Number), + lastEventAt: expect.any(Number), + lastError: null, + }), + ); + }); + it("resolves disconnect waiter on socket disconnect event", async () => { const client = new FakeEmitter(); const app = { receiver: { client } }; diff --git a/src/slack/monitor/provider.ts b/src/slack/monitor/provider.ts index b7a10588e3f..12ba1020268 100644 --- a/src/slack/monitor/provider.ts +++ b/src/slack/monitor/provider.ts @@ -18,6 +18,7 @@ import { } from "../../config/runtime-group-policy.js"; import type { SessionScope } from "../../config/sessions.js"; import { normalizeResolvedSecretInputString } from "../../config/types.secrets.js"; +import { createConnectedChannelStatusPatch } from "../../gateway/channel-status-patches.js"; import { warn } from "../../globals.js"; import { computeBackoff, sleepWithAbort } from "../../infra/backoff.js"; import { installRequestBodyLimitGuard } from "../../infra/http-body.js"; @@ -65,6 +66,17 @@ function parseApiAppIdFromAppToken(raw?: string) { return match?.[1]?.toUpperCase(); } +function publishSlackConnectedStatus(setStatus?: (next: Record) => void) { + if (!setStatus) { + return; + } + const now = Date.now(); + setStatus({ + ...createConnectedChannelStatusPatch(now), + lastError: null, + }); +} + export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) { const cfg = opts.config ?? loadConfig(); const runtime: RuntimeEnv = opts.runtime ?? createNonExitingRuntime(); @@ -390,6 +402,7 @@ export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) { try { await app.start(); reconnectAttempts = 0; + publishSlackConnectedStatus(opts.setStatus); runtime.log?.("slack socket mode connected"); } catch (err) { // Auth errors (account_inactive, invalid_auth, etc.) are permanent — @@ -481,6 +494,7 @@ export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) { export { isNonRecoverableSlackAuthError } from "./reconnect-policy.js"; export const __testing = { + publishSlackConnectedStatus, resolveSlackRuntimeGroupPolicy: resolveOpenProviderRuntimeGroupPolicy, resolveDefaultGroupPolicy, getSocketEmitter, diff --git a/src/web/auto-reply/monitor.ts b/src/web/auto-reply/monitor.ts index 66b9c0fd993..a9ef2f4b229 100644 --- a/src/web/auto-reply/monitor.ts +++ b/src/web/auto-reply/monitor.ts @@ -5,6 +5,7 @@ import { DEFAULT_GROUP_HISTORY_LIMIT } from "../../auto-reply/reply/history.js"; import { formatCliCommand } from "../../cli/command-format.js"; import { waitForever } from "../../cli/wait.js"; import { loadConfig } from "../../config/config.js"; +import { createConnectedChannelStatusPatch } from "../../gateway/channel-status-patches.js"; import { logVerbose } from "../../globals.js"; import { formatDurationPrecise } from "../../infra/format-time/format-duration.ts"; import { enqueueSystemEvent } from "../../infra/system-events.js"; @@ -210,9 +211,7 @@ export async function monitorWebChannel( }, }); - status.connected = true; - status.lastConnectedAt = Date.now(); - status.lastEventAt = status.lastConnectedAt; + Object.assign(status, createConnectedChannelStatusPatch()); status.lastError = null; emitStatus();