diff --git a/.secrets.baseline b/.secrets.baseline index 056b2dd8778..07641fb920b 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -12314,14 +12314,14 @@ "filename": "src/config/schema.help.ts", "hashed_secret": "9f4cda226d3868676ac7f86f59e4190eb94bd208", "is_verified": false, - "line_number": 653 + "line_number": 657 }, { "type": "Secret Keyword", "filename": "src/config/schema.help.ts", "hashed_secret": "01822c8bbf6a8b136944b14182cb885100ec2eae", "is_verified": false, - "line_number": 686 + "line_number": 690 } ], "src/config/schema.irc.ts": [ @@ -12360,14 +12360,14 @@ "filename": "src/config/schema.labels.ts", "hashed_secret": "e73c9fcad85cd4eecc74181ec4bdb31064d68439", "is_verified": false, - "line_number": 217 + "line_number": 219 }, { "type": "Secret Keyword", "filename": "src/config/schema.labels.ts", "hashed_secret": "2eda7cd978f39eebec3bf03e4410a40e14167fff", "is_verified": false, - "line_number": 326 + "line_number": 328 } ], "src/config/slack-http-config.test.ts": [ diff --git a/CHANGELOG.md b/CHANGELOG.md index de21281fbde..9df61dd75a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Docs: https://docs.openclaw.ai - Refactor/channels: remove the legacy channel shim directories and point channel-specific imports directly at the extension-owned implementations. (#45967) thanks @scoootscooob. - Feishu/streaming: add `onReasoningStream` and `onReasoningEnd` support to streaming cards, so `/reasoning stream` renders thinking tokens as markdown blockquotes in the same card — matching the Telegram channel's reasoning lane behavior. - Feishu/cards: add identity-aware structured card headers and note footers for Feishu replies and direct sends, while keeping that presentation wired through the shared outbound identity path. (#29938) Thanks @nszhsl. +- Gateway/health monitor: add configurable stale-event thresholds and restart limits, plus per-channel and per-account `healthMonitor.enabled` overrides, while keeping the existing global disable path on `gateway.channelHealthCheckMinutes=0`. (#42107) Thanks @rstar327. ### Fixes diff --git a/extensions/bluebubbles/src/types.ts b/extensions/bluebubbles/src/types.ts index 43e8c739775..11a1d486652 100644 --- a/extensions/bluebubbles/src/types.ts +++ b/extensions/bluebubbles/src/types.ts @@ -57,6 +57,10 @@ export type BlueBubblesAccountConfig = { allowPrivateNetwork?: boolean; /** Per-group configuration keyed by chat GUID or identifier. */ groups?: Record; + /** Channel health monitor overrides for this channel/account. */ + healthMonitor?: { + enabled?: boolean; + }; }; export type BlueBubblesActionConfig = { diff --git a/src/config/config-misc.test.ts b/src/config/config-misc.test.ts index bd9a05fea10..177711dcc03 100644 --- a/src/config/config-misc.test.ts +++ b/src/config/config-misc.test.ts @@ -212,6 +212,49 @@ describe("gateway.channelHealthCheckMinutes", () => { expect(res.issues[0]?.path).toBe("gateway.channelHealthCheckMinutes"); } }); + + it("rejects stale thresholds shorter than the health check interval", () => { + const res = validateConfigObject({ + gateway: { + channelHealthCheckMinutes: 5, + channelStaleEventThresholdMinutes: 4, + }, + }); + expect(res.ok).toBe(false); + if (!res.ok) { + expect(res.issues[0]?.path).toBe("gateway.channelStaleEventThresholdMinutes"); + } + }); + + it("accepts stale thresholds that match or exceed the health check interval", () => { + const equal = validateConfigObject({ + gateway: { + channelHealthCheckMinutes: 5, + channelStaleEventThresholdMinutes: 5, + }, + }); + expect(equal.ok).toBe(true); + + const greater = validateConfigObject({ + gateway: { + channelHealthCheckMinutes: 5, + channelStaleEventThresholdMinutes: 6, + }, + }); + expect(greater.ok).toBe(true); + }); + + it("rejects stale thresholds shorter than the default health check interval", () => { + const res = validateConfigObject({ + gateway: { + channelStaleEventThresholdMinutes: 4, + }, + }); + expect(res.ok).toBe(false); + if (!res.ok) { + expect(res.issues[0]?.path).toBe("gateway.channelStaleEventThresholdMinutes"); + } + }); }); describe("cron webhook schema", () => { diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 555ee02b8eb..7fbfdec76d8 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -102,6 +102,10 @@ export const FIELD_HELP: Record = { "Explicit gateway-level tool denylist to block risky tools even if lower-level policies allow them. Use deny rules for emergency response and defense-in-depth hardening.", "gateway.channelHealthCheckMinutes": "Interval in minutes for automatic channel health probing and status updates. Use lower intervals for faster detection, or higher intervals to reduce periodic probe noise.", + "gateway.channelStaleEventThresholdMinutes": + "How many minutes a connected channel can go without receiving any event before the health monitor treats it as a stale socket and triggers a restart. Default: 30.", + "gateway.channelMaxRestartsPerHour": + "Maximum number of health-monitor-initiated channel restarts allowed within a rolling one-hour window. Once hit, further restarts are skipped until the window expires. Default: 10.", "gateway.tailscale": "Tailscale integration settings for Serve/Funnel exposure and lifecycle handling on gateway start/exit. Keep off unless your deployment intentionally relies on Tailscale ingress.", "gateway.tailscale.mode": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index 9b1fdb73445..e700f2329b4 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -84,6 +84,8 @@ export const FIELD_LABELS: Record = { "gateway.tools.allow": "Gateway Tool Allowlist", "gateway.tools.deny": "Gateway Tool Denylist", "gateway.channelHealthCheckMinutes": "Gateway Channel Health Check Interval (min)", + "gateway.channelStaleEventThresholdMinutes": "Gateway Channel Stale Event Threshold (min)", + "gateway.channelMaxRestartsPerHour": "Gateway Channel Max Restarts Per Hour", "gateway.tailscale": "Gateway Tailscale", "gateway.tailscale.mode": "Gateway Tailscale Mode", "gateway.tailscale.resetOnExit": "Gateway Tailscale Reset on Exit", diff --git a/src/config/types.channel-messaging-common.ts b/src/config/types.channel-messaging-common.ts index 5d927884bd6..f918557aad6 100644 --- a/src/config/types.channel-messaging-common.ts +++ b/src/config/types.channel-messaging-common.ts @@ -4,7 +4,10 @@ import type { GroupPolicy, MarkdownConfig, } from "./types.base.js"; -import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js"; +import type { + ChannelHealthMonitorConfig, + ChannelHeartbeatVisibilityConfig, +} from "./types.channels.js"; import type { DmConfig } from "./types.messages.js"; export type CommonChannelMessagingConfig = { @@ -43,6 +46,8 @@ export type CommonChannelMessagingConfig = { blockStreamingCoalesce?: BlockStreamingCoalesceConfig; /** Heartbeat visibility settings for this channel. */ heartbeat?: ChannelHeartbeatVisibilityConfig; + /** Channel health monitor overrides for this channel/account. */ + healthMonitor?: ChannelHealthMonitorConfig; /** Outbound response prefix override for this channel/account. */ responsePrefix?: string; /** Max outbound media size in MB. */ diff --git a/src/config/types.channels.ts b/src/config/types.channels.ts index caa33631bb1..96d8efddac6 100644 --- a/src/config/types.channels.ts +++ b/src/config/types.channels.ts @@ -18,6 +18,14 @@ export type ChannelHeartbeatVisibilityConfig = { useIndicator?: boolean; }; +export type ChannelHealthMonitorConfig = { + /** + * Enable channel-health-monitor restarts for this channel or account. + * Inherits the global gateway setting when omitted. + */ + enabled?: boolean; +}; + export type ChannelDefaultsConfig = { groupPolicy?: GroupPolicy; /** Default heartbeat visibility for all channels. */ @@ -39,6 +47,7 @@ export type ExtensionChannelConfig = { defaultAccount?: string; dmPolicy?: string; groupPolicy?: GroupPolicy; + healthMonitor?: ChannelHealthMonitorConfig; accounts?: Record; [key: string]: unknown; }; diff --git a/src/config/types.discord.ts b/src/config/types.discord.ts index e25f7c5f592..a27fd3f8b45 100644 --- a/src/config/types.discord.ts +++ b/src/config/types.discord.ts @@ -8,7 +8,10 @@ import type { OutboundRetryConfig, ReplyToMode, } from "./types.base.js"; -import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js"; +import type { + ChannelHealthMonitorConfig, + ChannelHeartbeatVisibilityConfig, +} from "./types.channels.js"; import type { DmConfig, ProviderCommandsConfig } from "./types.messages.js"; import type { SecretInput } from "./types.secrets.js"; import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js"; @@ -297,6 +300,8 @@ export type DiscordAccountConfig = { guilds?: Record; /** Heartbeat visibility settings for this channel. */ heartbeat?: ChannelHeartbeatVisibilityConfig; + /** Channel health monitor overrides for this channel/account. */ + healthMonitor?: ChannelHealthMonitorConfig; /** Exec approval forwarding configuration. */ execApprovals?: DiscordExecApprovalConfig; /** Agent-controlled interactive components (buttons, select menus). */ diff --git a/src/config/types.gateway.ts b/src/config/types.gateway.ts index ea17a1d9d05..88a5350ab1d 100644 --- a/src/config/types.gateway.ts +++ b/src/config/types.gateway.ts @@ -431,4 +431,16 @@ export type GatewayConfig = { * Set to 0 to disable. Default: 5. */ channelHealthCheckMinutes?: number; + /** + * Stale event threshold in minutes for the channel health monitor. + * A connected channel that receives no events for this duration is treated + * as a stale socket and restarted. Default: 30. + */ + channelStaleEventThresholdMinutes?: number; + /** + * Maximum number of health-monitor-initiated channel restarts per hour. + * Once this limit is reached, the monitor skips further restarts until + * the rolling window expires. Default: 10. + */ + channelMaxRestartsPerHour?: number; }; diff --git a/src/config/types.googlechat.ts b/src/config/types.googlechat.ts index 091c4f0f271..fdfc23fd866 100644 --- a/src/config/types.googlechat.ts +++ b/src/config/types.googlechat.ts @@ -4,6 +4,7 @@ import type { GroupPolicy, ReplyToMode, } from "./types.base.js"; +import type { ChannelHealthMonitorConfig } from "./types.channels.js"; import type { DmConfig } from "./types.messages.js"; import type { SecretRef } from "./types.secrets.js"; @@ -99,6 +100,8 @@ export type GoogleChatAccountConfig = { /** Per-action tool gating (default: true for all). */ actions?: GoogleChatActionConfig; dm?: GoogleChatDmConfig; + /** Channel health monitor overrides for this channel/account. */ + healthMonitor?: ChannelHealthMonitorConfig; /** * Typing indicator mode (default: "message"). * - "none": No indicator diff --git a/src/config/types.imessage.ts b/src/config/types.imessage.ts index 9fe1b96fef2..4d63965586b 100644 --- a/src/config/types.imessage.ts +++ b/src/config/types.imessage.ts @@ -4,7 +4,10 @@ import type { GroupPolicy, MarkdownConfig, } from "./types.base.js"; -import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js"; +import type { + ChannelHealthMonitorConfig, + ChannelHeartbeatVisibilityConfig, +} from "./types.channels.js"; import type { DmConfig } from "./types.messages.js"; import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js"; @@ -77,6 +80,8 @@ export type IMessageAccountConfig = { >; /** Heartbeat visibility settings for this channel. */ heartbeat?: ChannelHeartbeatVisibilityConfig; + /** Channel health monitor overrides for this channel/account. */ + healthMonitor?: ChannelHealthMonitorConfig; /** Outbound response prefix override for this channel/account. */ responsePrefix?: string; }; diff --git a/src/config/types.msteams.ts b/src/config/types.msteams.ts index 35470a56178..83195f03a40 100644 --- a/src/config/types.msteams.ts +++ b/src/config/types.msteams.ts @@ -4,7 +4,10 @@ import type { GroupPolicy, MarkdownConfig, } from "./types.base.js"; -import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js"; +import type { + ChannelHealthMonitorConfig, + ChannelHeartbeatVisibilityConfig, +} from "./types.channels.js"; import type { DmConfig } from "./types.messages.js"; import type { SecretInput } from "./types.secrets.js"; import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js"; @@ -114,6 +117,8 @@ export type MSTeamsConfig = { sharePointSiteId?: string; /** Heartbeat visibility settings for this channel. */ heartbeat?: ChannelHeartbeatVisibilityConfig; + /** Channel health monitor overrides for this channel. */ + healthMonitor?: ChannelHealthMonitorConfig; /** Outbound response prefix override for this channel/account. */ responsePrefix?: string; }; diff --git a/src/config/types.slack.ts b/src/config/types.slack.ts index a90f1ed5020..c62e3b03e64 100644 --- a/src/config/types.slack.ts +++ b/src/config/types.slack.ts @@ -5,7 +5,10 @@ import type { MarkdownConfig, ReplyToMode, } from "./types.base.js"; -import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js"; +import type { + ChannelHealthMonitorConfig, + ChannelHeartbeatVisibilityConfig, +} from "./types.channels.js"; import type { DmConfig, ProviderCommandsConfig } from "./types.messages.js"; import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js"; @@ -185,6 +188,8 @@ export type SlackAccountConfig = { channels?: Record; /** Heartbeat visibility settings for this channel. */ heartbeat?: ChannelHeartbeatVisibilityConfig; + /** Channel health monitor overrides for this channel/account. */ + healthMonitor?: ChannelHealthMonitorConfig; /** Outbound response prefix override for this channel/account. */ responsePrefix?: string; /** diff --git a/src/config/types.telegram.ts b/src/config/types.telegram.ts index 45eac2fb310..252f66740b2 100644 --- a/src/config/types.telegram.ts +++ b/src/config/types.telegram.ts @@ -8,7 +8,10 @@ import type { ReplyToMode, SessionThreadBindingsConfig, } from "./types.base.js"; -import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js"; +import type { + ChannelHealthMonitorConfig, + ChannelHeartbeatVisibilityConfig, +} from "./types.channels.js"; import type { DmConfig, ProviderCommandsConfig } from "./types.messages.js"; import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js"; @@ -179,6 +182,8 @@ export type TelegramAccountConfig = { reactionLevel?: "off" | "ack" | "minimal" | "extensive"; /** Heartbeat visibility settings for this channel. */ heartbeat?: ChannelHeartbeatVisibilityConfig; + /** Channel health monitor overrides for this channel/account. */ + healthMonitor?: ChannelHealthMonitorConfig; /** Controls whether link previews are shown in outbound messages. Default: true. */ linkPreview?: boolean; /** diff --git a/src/config/types.whatsapp.ts b/src/config/types.whatsapp.ts index a39a5c28e1f..29ae866956a 100644 --- a/src/config/types.whatsapp.ts +++ b/src/config/types.whatsapp.ts @@ -4,7 +4,10 @@ import type { GroupPolicy, MarkdownConfig, } from "./types.base.js"; -import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js"; +import type { + ChannelHealthMonitorConfig, + ChannelHeartbeatVisibilityConfig, +} from "./types.channels.js"; import type { DmConfig } from "./types.messages.js"; import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js"; @@ -78,6 +81,8 @@ type WhatsAppSharedConfig = { debounceMs?: number; /** Heartbeat visibility settings. */ heartbeat?: ChannelHeartbeatVisibilityConfig; + /** Channel health monitor overrides for this channel/account. */ + healthMonitor?: ChannelHealthMonitorConfig; }; type WhatsAppConfigCore = { diff --git a/src/config/zod-schema.channels.ts b/src/config/zod-schema.channels.ts index ebabe1bae94..94d6d24caed 100644 --- a/src/config/zod-schema.channels.ts +++ b/src/config/zod-schema.channels.ts @@ -8,3 +8,10 @@ export const ChannelHeartbeatVisibilitySchema = z }) .strict() .optional(); + +export const ChannelHealthMonitorSchema = z + .object({ + enabled: z.boolean().optional(), + }) + .strict() + .optional(); diff --git a/src/config/zod-schema.providers-core.ts b/src/config/zod-schema.providers-core.ts index ced89bd8512..e6e4a3aacd2 100644 --- a/src/config/zod-schema.providers-core.ts +++ b/src/config/zod-schema.providers-core.ts @@ -13,7 +13,10 @@ import { resolveTelegramCustomCommands, } from "./telegram-custom-commands.js"; import { ToolPolicySchema } from "./zod-schema.agent-runtime.js"; -import { ChannelHeartbeatVisibilitySchema } from "./zod-schema.channels.js"; +import { + ChannelHealthMonitorSchema, + ChannelHeartbeatVisibilitySchema, +} from "./zod-schema.channels.js"; import { BlockStreamingChunkSchema, BlockStreamingCoalesceSchema, @@ -271,6 +274,7 @@ export const TelegramAccountSchemaBase = z reactionNotifications: z.enum(["off", "own", "all"]).optional(), reactionLevel: z.enum(["off", "ack", "minimal", "extensive"]).optional(), heartbeat: ChannelHeartbeatVisibilitySchema, + healthMonitor: ChannelHealthMonitorSchema, linkPreview: z.boolean().optional(), responsePrefix: z.string().optional(), ackReaction: z.string().optional(), @@ -511,6 +515,7 @@ export const DiscordAccountSchema = z dm: DiscordDmSchema.optional(), guilds: z.record(z.string(), DiscordGuildSchema.optional()).optional(), heartbeat: ChannelHeartbeatVisibilitySchema, + healthMonitor: ChannelHealthMonitorSchema, execApprovals: z .object({ enabled: z.boolean().optional(), @@ -782,6 +787,7 @@ export const GoogleChatAccountSchema = z .strict() .optional(), dm: GoogleChatDmSchema.optional(), + healthMonitor: ChannelHealthMonitorSchema, typingIndicator: z.enum(["none", "message", "reaction"]).optional(), responsePrefix: z.string().optional(), }) @@ -898,6 +904,7 @@ export const SlackAccountSchema = z dm: SlackDmSchema.optional(), channels: z.record(z.string(), SlackChannelSchema.optional()).optional(), heartbeat: ChannelHeartbeatVisibilitySchema, + healthMonitor: ChannelHealthMonitorSchema, responsePrefix: z.string().optional(), ackReaction: z.string().optional(), typingReaction: z.string().optional(), @@ -1032,6 +1039,7 @@ export const SignalAccountSchemaBase = z .optional(), reactionLevel: z.enum(["off", "ack", "minimal", "extensive"]).optional(), heartbeat: ChannelHeartbeatVisibilitySchema, + healthMonitor: ChannelHealthMonitorSchema, responsePrefix: z.string().optional(), }) .strict(); @@ -1145,6 +1153,7 @@ export const IrcAccountSchemaBase = z blockStreamingCoalesce: BlockStreamingCoalesceSchema.optional(), mediaMaxMb: z.number().positive().optional(), heartbeat: ChannelHeartbeatVisibilitySchema, + healthMonitor: ChannelHealthMonitorSchema, responsePrefix: z.string().optional(), }) .strict(); @@ -1272,6 +1281,7 @@ export const IMessageAccountSchemaBase = z ) .optional(), heartbeat: ChannelHeartbeatVisibilitySchema, + healthMonitor: ChannelHealthMonitorSchema, responsePrefix: z.string().optional(), }) .strict(); @@ -1383,6 +1393,7 @@ export const BlueBubblesAccountSchemaBase = z blockStreamingCoalesce: BlockStreamingCoalesceSchema.optional(), groups: z.record(z.string(), BlueBubblesGroupConfigSchema.optional()).optional(), heartbeat: ChannelHeartbeatVisibilitySchema, + healthMonitor: ChannelHealthMonitorSchema, responsePrefix: z.string().optional(), }) .strict(); @@ -1499,6 +1510,7 @@ export const MSTeamsConfigSchema = z /** SharePoint site ID for file uploads in group chats/channels (e.g., "contoso.sharepoint.com,guid1,guid2") */ sharePointSiteId: z.string().optional(), heartbeat: ChannelHeartbeatVisibilitySchema, + healthMonitor: ChannelHealthMonitorSchema, responsePrefix: z.string().optional(), }) .strict() diff --git a/src/config/zod-schema.providers-whatsapp.ts b/src/config/zod-schema.providers-whatsapp.ts index 2faba715bad..26b7c476c53 100644 --- a/src/config/zod-schema.providers-whatsapp.ts +++ b/src/config/zod-schema.providers-whatsapp.ts @@ -1,6 +1,9 @@ import { z } from "zod"; import { ToolPolicySchema } from "./zod-schema.agent-runtime.js"; -import { ChannelHeartbeatVisibilitySchema } from "./zod-schema.channels.js"; +import { + ChannelHealthMonitorSchema, + ChannelHeartbeatVisibilitySchema, +} from "./zod-schema.channels.js"; import { BlockStreamingCoalesceSchema, DmConfigSchema, @@ -56,6 +59,7 @@ const WhatsAppSharedSchema = z.object({ ackReaction: WhatsAppAckReactionSchema, debounceMs: z.number().int().nonnegative().optional().default(0), heartbeat: ChannelHeartbeatVisibilitySchema, + healthMonitor: ChannelHealthMonitorSchema, }); function enforceOpenDmPolicyAllowFromStar(params: { diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index 8c78d049d0e..20b8b232157 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -696,6 +696,8 @@ export const OpenClawSchema = z .strict() .optional(), channelHealthCheckMinutes: z.number().int().min(0).optional(), + channelStaleEventThresholdMinutes: z.number().int().min(1).optional(), + channelMaxRestartsPerHour: z.number().int().min(1).optional(), tailscale: z .object({ mode: z.union([z.literal("off"), z.literal("serve"), z.literal("funnel")]).optional(), @@ -833,6 +835,21 @@ export const OpenClawSchema = z .optional(), }) .strict() + .superRefine((gateway, ctx) => { + const effectiveHealthCheckMinutes = gateway.channelHealthCheckMinutes ?? 5; + if ( + gateway.channelStaleEventThresholdMinutes != null && + effectiveHealthCheckMinutes !== 0 && + gateway.channelStaleEventThresholdMinutes < effectiveHealthCheckMinutes + ) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + path: ["channelStaleEventThresholdMinutes"], + message: + "channelStaleEventThresholdMinutes should be >= channelHealthCheckMinutes to avoid delayed stale detection", + }); + } + }) .optional(), memory: MemorySchema, skills: z diff --git a/src/gateway/channel-health-monitor.test.ts b/src/gateway/channel-health-monitor.test.ts index 32052af5745..efc392f8ee0 100644 --- a/src/gateway/channel-health-monitor.test.ts +++ b/src/gateway/channel-health-monitor.test.ts @@ -11,6 +11,7 @@ function createMockChannelManager(overrides?: Partial): ChannelM startChannel: vi.fn(async () => {}), stopChannel: vi.fn(async () => {}), markChannelLoggedOut: vi.fn(), + isHealthMonitorEnabled: vi.fn(() => true), isManuallyStopped: vi.fn(() => false), resetRestartAttempts: vi.fn(), ...overrides, @@ -226,6 +227,53 @@ describe("channel-health-monitor", () => { await expectNoStart(manager); }); + it("skips channels with health monitor disabled globally for that account", async () => { + const manager = createSnapshotManager( + { + discord: { + default: { running: false, enabled: true, configured: true }, + }, + }, + { isHealthMonitorEnabled: vi.fn(() => false) }, + ); + await expectNoStart(manager); + }); + + it("still restarts enabled accounts when another account on the same channel is disabled", async () => { + const now = Date.now(); + const manager = createSnapshotManager( + { + discord: { + default: { + running: true, + connected: false, + enabled: true, + configured: true, + lastStartAt: now - 300_000, + }, + quiet: { + running: true, + connected: false, + enabled: true, + configured: true, + lastStartAt: now - 300_000, + }, + }, + }, + { + isHealthMonitorEnabled: vi.fn((channelId: ChannelId, accountId: string) => { + return !(channelId === "discord" && accountId === "quiet"); + }), + }, + ); + const monitor = await startAndRunCheck(manager); + expect(manager.stopChannel).toHaveBeenCalledWith("discord", "default"); + expect(manager.startChannel).toHaveBeenCalledWith("discord", "default"); + expect(manager.stopChannel).not.toHaveBeenCalledWith("discord", "quiet"); + expect(manager.startChannel).not.toHaveBeenCalledWith("discord", "quiet"); + monitor.stop(); + }); + it("restarts a stuck channel (running but not connected)", async () => { const now = Date.now(); const manager = createSnapshotManager({ diff --git a/src/gateway/channel-health-monitor.ts b/src/gateway/channel-health-monitor.ts index fb8715a12f1..809beb1abb8 100644 --- a/src/gateway/channel-health-monitor.ts +++ b/src/gateway/channel-health-monitor.ts @@ -118,6 +118,9 @@ export function startChannelHealthMonitor(deps: ChannelHealthMonitorDeps): Chann if (!status) { continue; } + if (!channelManager.isHealthMonitorEnabled(channelId as ChannelId, accountId)) { + continue; + } if (channelManager.isManuallyStopped(channelId as ChannelId, accountId)) { continue; } diff --git a/src/gateway/config-reload-plan.ts b/src/gateway/config-reload-plan.ts index 4ca1fcea7f0..63eddd31c54 100644 --- a/src/gateway/config-reload-plan.ts +++ b/src/gateway/config-reload-plan.ts @@ -41,6 +41,16 @@ const BASE_RELOAD_RULES: ReloadRule[] = [ kind: "hot", actions: ["restart-health-monitor"], }, + { + prefix: "gateway.channelStaleEventThresholdMinutes", + kind: "hot", + actions: ["restart-health-monitor"], + }, + { + prefix: "gateway.channelMaxRestartsPerHour", + kind: "hot", + actions: ["restart-health-monitor"], + }, // Stuck-session warning threshold is read by the diagnostics heartbeat loop. { prefix: "diagnostics.stuckSessionWarnMs", kind: "none" }, { prefix: "hooks.gmail", kind: "hot", actions: ["restart-gmail-watcher"] }, diff --git a/src/gateway/server-channels.test.ts b/src/gateway/server-channels.test.ts index c442c142417..b6e8f556123 100644 --- a/src/gateway/server-channels.test.ts +++ b/src/gateway/server-channels.test.ts @@ -44,12 +44,13 @@ function createTestPlugin(params?: { account?: TestAccount; startAccount?: NonNullable["gateway"]>["startAccount"]; includeDescribeAccount?: boolean; + resolveAccount?: ChannelPlugin["config"]["resolveAccount"]; }): ChannelPlugin { const account = params?.account ?? { enabled: true, configured: true }; const includeDescribeAccount = params?.includeDescribeAccount !== false; const config: ChannelPlugin["config"] = { listAccountIds: () => [DEFAULT_ACCOUNT_ID], - resolveAccount: () => account, + resolveAccount: params?.resolveAccount ?? (() => account), isEnabled: (resolved) => resolved.enabled !== false, }; if (includeDescribeAccount) { @@ -88,13 +89,16 @@ function installTestRegistry(plugin: ChannelPlugin) { setActivePluginRegistry(registry); } -function createManager(options?: { channelRuntime?: PluginRuntime["channel"] }) { +function createManager(options?: { + channelRuntime?: PluginRuntime["channel"]; + loadConfig?: () => Record; +}) { const log = createSubsystemLogger("gateway/server-channels-test"); const channelLogs = { discord: log } as Record; const runtime = runtimeForLogger(log); const channelRuntimeEnvs = { discord: runtime } as Record; return createChannelManager({ - loadConfig: () => ({}), + loadConfig: () => options?.loadConfig?.() ?? {}, channelLogs, channelRuntimeEnvs, ...(options?.channelRuntime ? { channelRuntime: options.channelRuntime } : {}), @@ -180,4 +184,104 @@ describe("server-channels auto restart", () => { await manager.startChannels(); expect(startAccount).toHaveBeenCalledTimes(1); }); + + it("reuses plugin account resolution for health monitor overrides", () => { + installTestRegistry( + createTestPlugin({ + resolveAccount: (cfg, accountId) => { + const accounts = ( + cfg as { + channels?: { + discord?: { + accounts?: Record< + string, + TestAccount & { healthMonitor?: { enabled?: boolean } } + >; + }; + }; + } + ).channels?.discord?.accounts; + if (!accounts) { + return { enabled: true, configured: true }; + } + const direct = accounts[accountId ?? DEFAULT_ACCOUNT_ID]; + if (direct) { + return direct; + } + const normalized = (accountId ?? DEFAULT_ACCOUNT_ID).toLowerCase().replaceAll(" ", "-"); + const matchKey = Object.keys(accounts).find( + (key) => key.toLowerCase().replaceAll(" ", "-") === normalized, + ); + return matchKey ? (accounts[matchKey] ?? { enabled: true, configured: true }) : {}; + }, + }), + ); + + const manager = createManager({ + loadConfig: () => ({ + channels: { + discord: { + accounts: { + "Router D": { + enabled: true, + configured: true, + healthMonitor: { enabled: false }, + }, + }, + }, + }, + }), + }); + + expect(manager.isHealthMonitorEnabled("discord", "router-d")).toBe(false); + }); + + it("falls back to channel-level health monitor overrides when account resolution omits them", () => { + installTestRegistry( + createTestPlugin({ + resolveAccount: () => ({ + enabled: true, + configured: true, + }), + }), + ); + + const manager = createManager({ + loadConfig: () => ({ + channels: { + discord: { + healthMonitor: { enabled: false }, + }, + }, + }), + }); + + expect(manager.isHealthMonitorEnabled("discord", DEFAULT_ACCOUNT_ID)).toBe(false); + }); + + it("uses wrapped account config health monitor overrides", () => { + installTestRegistry( + createTestPlugin({ + resolveAccount: () => ({ + enabled: true, + configured: true, + config: { + healthMonitor: { enabled: false }, + }, + }), + }), + ); + + const manager = createManager({ + loadConfig: () => ({ + channels: { + discord: { + healthMonitor: { enabled: true }, + }, + }, + }), + }); + + expect(manager.isHealthMonitorEnabled("discord", DEFAULT_ACCOUNT_ID)).toBe(false); + }); }); diff --git a/src/gateway/server-channels.ts b/src/gateway/server-channels.ts index 4090791d285..5595b946884 100644 --- a/src/gateway/server-channels.ts +++ b/src/gateway/server-channels.ts @@ -105,6 +105,7 @@ export type ChannelManager = { markChannelLoggedOut: (channelId: ChannelId, cleared: boolean, accountId?: string) => void; isManuallyStopped: (channelId: ChannelId, accountId: string) => boolean; resetRestartAttempts: (channelId: ChannelId, accountId: string) => void; + isHealthMonitorEnabled: (channelId: ChannelId, accountId: string) => boolean; }; // Channel docking: lifecycle hooks (`plugin.gateway`) flow through this manager. @@ -119,6 +120,48 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage const restartKey = (channelId: ChannelId, accountId: string) => `${channelId}:${accountId}`; + const isHealthMonitorEnabled = (channelId: ChannelId, accountId: string): boolean => { + const cfg = loadConfig(); + const plugin = getChannelPlugin(channelId); + const resolvedAccount = plugin?.config.resolveAccount(cfg, accountId) as + | { + healthMonitor?: { + enabled?: boolean; + }; + config?: { + healthMonitor?: { + enabled?: boolean; + }; + }; + } + | undefined; + const accountOverride = resolvedAccount?.healthMonitor?.enabled; + const wrappedAccountOverride = resolvedAccount?.config?.healthMonitor?.enabled; + const channelOverride = ( + cfg.channels?.[channelId] as + | { + healthMonitor?: { + enabled?: boolean; + }; + } + | undefined + )?.healthMonitor?.enabled; + + if (typeof accountOverride === "boolean") { + return accountOverride; + } + + if (typeof wrappedAccountOverride === "boolean") { + return wrappedAccountOverride; + } + + if (typeof channelOverride === "boolean") { + return channelOverride; + } + + return true; + }; + const getStore = (channelId: ChannelId): ChannelRuntimeStore => { const existing = channelStores.get(channelId); if (existing) { @@ -453,5 +496,6 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage markChannelLoggedOut, isManuallyStopped: isManuallyStopped_, resetRestartAttempts: resetRestartAttempts_, + isHealthMonitorEnabled, }; } diff --git a/src/gateway/server-reload-handlers.ts b/src/gateway/server-reload-handlers.ts index f9cfb9111fe..008f0977d37 100644 --- a/src/gateway/server-reload-handlers.ts +++ b/src/gateway/server-reload-handlers.ts @@ -50,7 +50,11 @@ export function createGatewayReloadHandlers(params: { logChannels: { info: (msg: string) => void; error: (msg: string) => void }; logCron: { error: (msg: string) => void }; logReload: { info: (msg: string) => void; warn: (msg: string) => void }; - createHealthMonitor: (checkIntervalMs: number) => ChannelHealthMonitor; + createHealthMonitor: (opts: { + checkIntervalMs: number; + staleEventThresholdMs?: number; + maxRestartsPerHour?: number; + }) => ChannelHealthMonitor; }) { const applyHotReload = async ( plan: GatewayReloadPlan, @@ -101,8 +105,17 @@ export function createGatewayReloadHandlers(params: { if (plan.restartHealthMonitor) { state.channelHealthMonitor?.stop(); const minutes = nextConfig.gateway?.channelHealthCheckMinutes; + const staleMinutes = nextConfig.gateway?.channelStaleEventThresholdMinutes; nextState.channelHealthMonitor = - minutes === 0 ? null : params.createHealthMonitor((minutes ?? 5) * 60_000); + minutes === 0 + ? null + : params.createHealthMonitor({ + checkIntervalMs: (minutes ?? 5) * 60_000, + ...(staleMinutes != null && { staleEventThresholdMs: staleMinutes * 60_000 }), + ...(nextConfig.gateway?.channelMaxRestartsPerHour != null && { + maxRestartsPerHour: nextConfig.gateway.channelMaxRestartsPerHour, + }), + }); } if (plan.restartGmailWatcher) { diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index 9b3941d1432..5453ff8fcee 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -757,11 +757,17 @@ export async function startGatewayServer( const healthCheckMinutes = cfgAtStart.gateway?.channelHealthCheckMinutes; const healthCheckDisabled = healthCheckMinutes === 0; + const staleEventThresholdMinutes = cfgAtStart.gateway?.channelStaleEventThresholdMinutes; + const maxRestartsPerHour = cfgAtStart.gateway?.channelMaxRestartsPerHour; let channelHealthMonitor = healthCheckDisabled ? null : startChannelHealthMonitor({ channelManager, checkIntervalMs: (healthCheckMinutes ?? 5) * 60_000, + ...(staleEventThresholdMinutes != null && { + staleEventThresholdMs: staleEventThresholdMinutes * 60_000, + }), + ...(maxRestartsPerHour != null && { maxRestartsPerHour }), }); if (!minimalTestGateway) { @@ -980,8 +986,21 @@ export async function startGatewayServer( logChannels, logCron, logReload, - createHealthMonitor: (checkIntervalMs: number) => - startChannelHealthMonitor({ channelManager, checkIntervalMs }), + createHealthMonitor: (opts: { + checkIntervalMs: number; + staleEventThresholdMs?: number; + maxRestartsPerHour?: number; + }) => + startChannelHealthMonitor({ + channelManager, + checkIntervalMs: opts.checkIntervalMs, + ...(opts.staleEventThresholdMs != null && { + staleEventThresholdMs: opts.staleEventThresholdMs, + }), + ...(opts.maxRestartsPerHour != null && { + maxRestartsPerHour: opts.maxRestartsPerHour, + }), + }), }); return startGatewayConfigReloader({ diff --git a/src/gateway/server.reload.test.ts b/src/gateway/server.reload.test.ts index da749fc6501..e16dcd3f35c 100644 --- a/src/gateway/server.reload.test.ts +++ b/src/gateway/server.reload.test.ts @@ -109,6 +109,9 @@ const hoisted = vi.hoisted(() => { startChannel: vi.fn(async () => {}), stopChannel: vi.fn(async () => {}), markChannelLoggedOut: vi.fn(), + isHealthMonitorEnabled: vi.fn(() => true), + isManuallyStopped: vi.fn(() => false), + resetRestartAttempts: vi.fn(), }; const createChannelManager = vi.fn(() => providerManager); diff --git a/src/gateway/server/readiness.test.ts b/src/gateway/server/readiness.test.ts index b333277f158..f41373dab7e 100644 --- a/src/gateway/server/readiness.test.ts +++ b/src/gateway/server/readiness.test.ts @@ -26,6 +26,7 @@ function createManager(snapshot: ChannelRuntimeSnapshot): ChannelManager { startChannel: vi.fn(), stopChannel: vi.fn(), markChannelLoggedOut: vi.fn(), + isHealthMonitorEnabled: vi.fn(() => true), isManuallyStopped: vi.fn(() => false), resetRestartAttempts: vi.fn(), };