gateway: add health monitor enable overrides

This commit is contained in:
Tak Hoffman 2026-03-14 20:07:13 -05:00
parent e1c9972f31
commit 3658475238
25 changed files with 189 additions and 11 deletions

View File

@ -57,6 +57,10 @@ export type BlueBubblesAccountConfig = {
allowPrivateNetwork?: boolean;
/** Per-group configuration keyed by chat GUID or identifier. */
groups?: Record<string, BlueBubblesGroupConfig>;
/** Channel health monitor overrides for this channel/account. */
healthMonitor?: {
enabled?: boolean;
};
};
export type BlueBubblesActionConfig = {

View File

@ -102,6 +102,8 @@ export const FIELD_HELP: Record<string, string> = {
"Explicit gateway-level tool denylist to block risky tools even if lower-level policies allow them. Use deny rules for emergency response and defense-in-depth hardening.",
"gateway.channelHealthCheckMinutes":
"Interval in minutes for automatic channel health probing and status updates. Use lower intervals for faster detection, or higher intervals to reduce periodic probe noise.",
"gateway.channelHealthMonitorEnabled":
"Global enable switch for the gateway channel health monitor. Set false to disable all health-monitor-initiated channel restarts; per-channel healthMonitor.enabled overrides can further disable individual channels or accounts when the global monitor stays on.",
"gateway.channelStaleEventThresholdMinutes":
"How many minutes a connected channel can go without receiving any event before the health monitor treats it as a stale socket and triggers a restart. Default: 30.",
"gateway.channelMaxRestartsPerHour":

View File

@ -83,6 +83,7 @@ export const FIELD_LABELS: Record<string, string> = {
"gateway.tools": "Gateway Tool Exposure Policy",
"gateway.tools.allow": "Gateway Tool Allowlist",
"gateway.tools.deny": "Gateway Tool Denylist",
"gateway.channelHealthMonitorEnabled": "Gateway Channel Health Monitor Enabled",
"gateway.channelHealthCheckMinutes": "Gateway Channel Health Check Interval (min)",
"gateway.channelStaleEventThresholdMinutes": "Gateway Channel Stale Event Threshold (min)",
"gateway.channelMaxRestartsPerHour": "Gateway Channel Max Restarts Per Hour",

View File

@ -4,7 +4,10 @@ import type {
GroupPolicy,
MarkdownConfig,
} from "./types.base.js";
import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js";
import type {
ChannelHealthMonitorConfig,
ChannelHeartbeatVisibilityConfig,
} from "./types.channels.js";
import type { DmConfig } from "./types.messages.js";
export type CommonChannelMessagingConfig = {
@ -43,6 +46,8 @@ export type CommonChannelMessagingConfig = {
blockStreamingCoalesce?: BlockStreamingCoalesceConfig;
/** Heartbeat visibility settings for this channel. */
heartbeat?: ChannelHeartbeatVisibilityConfig;
/** Channel health monitor overrides for this channel/account. */
healthMonitor?: ChannelHealthMonitorConfig;
/** Outbound response prefix override for this channel/account. */
responsePrefix?: string;
/** Max outbound media size in MB. */

View File

@ -18,6 +18,14 @@ export type ChannelHeartbeatVisibilityConfig = {
useIndicator?: boolean;
};
/**
* Channel health monitor override block, referenced by the `healthMonitor`
* field of channel and account config types.
*/
export type ChannelHealthMonitorConfig = {
/**
* Enable channel-health-monitor restarts for this channel or account.
* When omitted, an account falls back to its channel-level value, and a
* channel with no value is treated as enabled. Setting this to true does not
* re-enable monitoring while `gateway.channelHealthMonitorEnabled` is false.
*/
enabled?: boolean;
};
export type ChannelDefaultsConfig = {
groupPolicy?: GroupPolicy;
/** Default heartbeat visibility for all channels. */
@ -39,6 +47,7 @@ export type ExtensionChannelConfig = {
defaultAccount?: string;
dmPolicy?: string;
groupPolicy?: GroupPolicy;
healthMonitor?: ChannelHealthMonitorConfig;
accounts?: Record<string, unknown>;
[key: string]: unknown;
};

View File

@ -8,7 +8,10 @@ import type {
OutboundRetryConfig,
ReplyToMode,
} from "./types.base.js";
import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js";
import type {
ChannelHealthMonitorConfig,
ChannelHeartbeatVisibilityConfig,
} from "./types.channels.js";
import type { DmConfig, ProviderCommandsConfig } from "./types.messages.js";
import type { SecretInput } from "./types.secrets.js";
import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js";
@ -297,6 +300,8 @@ export type DiscordAccountConfig = {
guilds?: Record<string, DiscordGuildEntry>;
/** Heartbeat visibility settings for this channel. */
heartbeat?: ChannelHeartbeatVisibilityConfig;
/** Channel health monitor overrides for this channel/account. */
healthMonitor?: ChannelHealthMonitorConfig;
/** Exec approval forwarding configuration. */
execApprovals?: DiscordExecApprovalConfig;
/** Agent-controlled interactive components (buttons, select menus). */

View File

@ -425,6 +425,12 @@ export type GatewayConfig = {
allowRealIpFallback?: boolean;
/** Tool access restrictions for HTTP /tools/invoke endpoint. */
tools?: GatewayToolsConfig;
/**
* Global enable switch for the channel health monitor.
* Set to false to disable health-monitor-driven channel restarts entirely.
* Default: true.
*/
channelHealthMonitorEnabled?: boolean;
/**
* Channel health monitor interval in minutes.
* Periodically checks channel health and restarts unhealthy channels.

View File

@ -4,6 +4,7 @@ import type {
GroupPolicy,
ReplyToMode,
} from "./types.base.js";
import type { ChannelHealthMonitorConfig } from "./types.channels.js";
import type { DmConfig } from "./types.messages.js";
import type { SecretRef } from "./types.secrets.js";
@ -99,6 +100,8 @@ export type GoogleChatAccountConfig = {
/** Per-action tool gating (default: true for all). */
actions?: GoogleChatActionConfig;
dm?: GoogleChatDmConfig;
/** Channel health monitor overrides for this channel/account. */
healthMonitor?: ChannelHealthMonitorConfig;
/**
* Typing indicator mode (default: "message").
* - "none": No indicator

View File

@ -4,7 +4,10 @@ import type {
GroupPolicy,
MarkdownConfig,
} from "./types.base.js";
import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js";
import type {
ChannelHealthMonitorConfig,
ChannelHeartbeatVisibilityConfig,
} from "./types.channels.js";
import type { DmConfig } from "./types.messages.js";
import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js";
@ -77,6 +80,8 @@ export type IMessageAccountConfig = {
>;
/** Heartbeat visibility settings for this channel. */
heartbeat?: ChannelHeartbeatVisibilityConfig;
/** Channel health monitor overrides for this channel/account. */
healthMonitor?: ChannelHealthMonitorConfig;
/** Outbound response prefix override for this channel/account. */
responsePrefix?: string;
};

View File

@ -4,7 +4,10 @@ import type {
GroupPolicy,
MarkdownConfig,
} from "./types.base.js";
import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js";
import type {
ChannelHealthMonitorConfig,
ChannelHeartbeatVisibilityConfig,
} from "./types.channels.js";
import type { DmConfig } from "./types.messages.js";
import type { SecretInput } from "./types.secrets.js";
import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js";
@ -114,6 +117,8 @@ export type MSTeamsConfig = {
sharePointSiteId?: string;
/** Heartbeat visibility settings for this channel. */
heartbeat?: ChannelHeartbeatVisibilityConfig;
/** Channel health monitor overrides for this channel. */
healthMonitor?: ChannelHealthMonitorConfig;
/** Outbound response prefix override for this channel/account. */
responsePrefix?: string;
};

View File

@ -5,7 +5,10 @@ import type {
MarkdownConfig,
ReplyToMode,
} from "./types.base.js";
import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js";
import type {
ChannelHealthMonitorConfig,
ChannelHeartbeatVisibilityConfig,
} from "./types.channels.js";
import type { DmConfig, ProviderCommandsConfig } from "./types.messages.js";
import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js";
@ -185,6 +188,8 @@ export type SlackAccountConfig = {
channels?: Record<string, SlackChannelConfig>;
/** Heartbeat visibility settings for this channel. */
heartbeat?: ChannelHeartbeatVisibilityConfig;
/** Channel health monitor overrides for this channel/account. */
healthMonitor?: ChannelHealthMonitorConfig;
/** Outbound response prefix override for this channel/account. */
responsePrefix?: string;
/**

View File

@ -8,7 +8,10 @@ import type {
ReplyToMode,
SessionThreadBindingsConfig,
} from "./types.base.js";
import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js";
import type {
ChannelHealthMonitorConfig,
ChannelHeartbeatVisibilityConfig,
} from "./types.channels.js";
import type { DmConfig, ProviderCommandsConfig } from "./types.messages.js";
import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js";
@ -179,6 +182,8 @@ export type TelegramAccountConfig = {
reactionLevel?: "off" | "ack" | "minimal" | "extensive";
/** Heartbeat visibility settings for this channel. */
heartbeat?: ChannelHeartbeatVisibilityConfig;
/** Channel health monitor overrides for this channel/account. */
healthMonitor?: ChannelHealthMonitorConfig;
/** Controls whether link previews are shown in outbound messages. Default: true. */
linkPreview?: boolean;
/**

View File

@ -4,7 +4,10 @@ import type {
GroupPolicy,
MarkdownConfig,
} from "./types.base.js";
import type { ChannelHeartbeatVisibilityConfig } from "./types.channels.js";
import type {
ChannelHealthMonitorConfig,
ChannelHeartbeatVisibilityConfig,
} from "./types.channels.js";
import type { DmConfig } from "./types.messages.js";
import type { GroupToolPolicyBySenderConfig, GroupToolPolicyConfig } from "./types.tools.js";
@ -78,6 +81,8 @@ type WhatsAppSharedConfig = {
debounceMs?: number;
/** Heartbeat visibility settings. */
heartbeat?: ChannelHeartbeatVisibilityConfig;
/** Channel health monitor overrides for this channel/account. */
healthMonitor?: ChannelHealthMonitorConfig;
};
type WhatsAppConfigCore = {

View File

@ -8,3 +8,10 @@ export const ChannelHeartbeatVisibilitySchema = z
})
.strict()
.optional();
/**
* Zod schema for a `healthMonitor` override block.
* The block as a whole is optional; when present it is strict, so any key
* other than the optional boolean `enabled` fails validation.
*/
export const ChannelHealthMonitorSchema = z
.object({
enabled: z.boolean().optional(),
})
.strict()
.optional();

View File

@ -13,7 +13,10 @@ import {
resolveTelegramCustomCommands,
} from "./telegram-custom-commands.js";
import { ToolPolicySchema } from "./zod-schema.agent-runtime.js";
import { ChannelHeartbeatVisibilitySchema } from "./zod-schema.channels.js";
import {
ChannelHealthMonitorSchema,
ChannelHeartbeatVisibilitySchema,
} from "./zod-schema.channels.js";
import {
BlockStreamingChunkSchema,
BlockStreamingCoalesceSchema,
@ -271,6 +274,7 @@ export const TelegramAccountSchemaBase = z
reactionNotifications: z.enum(["off", "own", "all"]).optional(),
reactionLevel: z.enum(["off", "ack", "minimal", "extensive"]).optional(),
heartbeat: ChannelHeartbeatVisibilitySchema,
healthMonitor: ChannelHealthMonitorSchema,
linkPreview: z.boolean().optional(),
responsePrefix: z.string().optional(),
ackReaction: z.string().optional(),
@ -511,6 +515,7 @@ export const DiscordAccountSchema = z
dm: DiscordDmSchema.optional(),
guilds: z.record(z.string(), DiscordGuildSchema.optional()).optional(),
heartbeat: ChannelHeartbeatVisibilitySchema,
healthMonitor: ChannelHealthMonitorSchema,
execApprovals: z
.object({
enabled: z.boolean().optional(),
@ -782,6 +787,7 @@ export const GoogleChatAccountSchema = z
.strict()
.optional(),
dm: GoogleChatDmSchema.optional(),
healthMonitor: ChannelHealthMonitorSchema,
typingIndicator: z.enum(["none", "message", "reaction"]).optional(),
responsePrefix: z.string().optional(),
})
@ -898,6 +904,7 @@ export const SlackAccountSchema = z
dm: SlackDmSchema.optional(),
channels: z.record(z.string(), SlackChannelSchema.optional()).optional(),
heartbeat: ChannelHeartbeatVisibilitySchema,
healthMonitor: ChannelHealthMonitorSchema,
responsePrefix: z.string().optional(),
ackReaction: z.string().optional(),
typingReaction: z.string().optional(),
@ -1032,6 +1039,7 @@ export const SignalAccountSchemaBase = z
.optional(),
reactionLevel: z.enum(["off", "ack", "minimal", "extensive"]).optional(),
heartbeat: ChannelHeartbeatVisibilitySchema,
healthMonitor: ChannelHealthMonitorSchema,
responsePrefix: z.string().optional(),
})
.strict();
@ -1145,6 +1153,7 @@ export const IrcAccountSchemaBase = z
blockStreamingCoalesce: BlockStreamingCoalesceSchema.optional(),
mediaMaxMb: z.number().positive().optional(),
heartbeat: ChannelHeartbeatVisibilitySchema,
healthMonitor: ChannelHealthMonitorSchema,
responsePrefix: z.string().optional(),
})
.strict();
@ -1272,6 +1281,7 @@ export const IMessageAccountSchemaBase = z
)
.optional(),
heartbeat: ChannelHeartbeatVisibilitySchema,
healthMonitor: ChannelHealthMonitorSchema,
responsePrefix: z.string().optional(),
})
.strict();
@ -1383,6 +1393,7 @@ export const BlueBubblesAccountSchemaBase = z
blockStreamingCoalesce: BlockStreamingCoalesceSchema.optional(),
groups: z.record(z.string(), BlueBubblesGroupConfigSchema.optional()).optional(),
heartbeat: ChannelHeartbeatVisibilitySchema,
healthMonitor: ChannelHealthMonitorSchema,
responsePrefix: z.string().optional(),
})
.strict();
@ -1499,6 +1510,7 @@ export const MSTeamsConfigSchema = z
/** SharePoint site ID for file uploads in group chats/channels (e.g., "contoso.sharepoint.com,guid1,guid2") */
sharePointSiteId: z.string().optional(),
heartbeat: ChannelHeartbeatVisibilitySchema,
healthMonitor: ChannelHealthMonitorSchema,
responsePrefix: z.string().optional(),
})
.strict()

View File

@ -1,6 +1,9 @@
import { z } from "zod";
import { ToolPolicySchema } from "./zod-schema.agent-runtime.js";
import { ChannelHeartbeatVisibilitySchema } from "./zod-schema.channels.js";
import {
ChannelHealthMonitorSchema,
ChannelHeartbeatVisibilitySchema,
} from "./zod-schema.channels.js";
import {
BlockStreamingCoalesceSchema,
DmConfigSchema,
@ -56,6 +59,7 @@ const WhatsAppSharedSchema = z.object({
ackReaction: WhatsAppAckReactionSchema,
debounceMs: z.number().int().nonnegative().optional().default(0),
heartbeat: ChannelHeartbeatVisibilitySchema,
healthMonitor: ChannelHealthMonitorSchema,
});
function enforceOpenDmPolicyAllowFromStar(params: {

View File

@ -695,6 +695,7 @@ export const OpenClawSchema = z
})
.strict()
.optional(),
channelHealthMonitorEnabled: z.boolean().optional(),
channelHealthCheckMinutes: z.number().int().min(0).optional(),
channelStaleEventThresholdMinutes: z.number().int().min(1).optional(),
channelMaxRestartsPerHour: z.number().int().min(1).optional(),

View File

@ -11,6 +11,7 @@ function createMockChannelManager(overrides?: Partial<ChannelManager>): ChannelM
startChannel: vi.fn(async () => {}),
stopChannel: vi.fn(async () => {}),
markChannelLoggedOut: vi.fn(),
isHealthMonitorEnabled: vi.fn(() => true),
isManuallyStopped: vi.fn(() => false),
resetRestartAttempts: vi.fn(),
...overrides,
@ -226,6 +227,53 @@ describe("channel-health-monitor", () => {
await expectNoStart(manager);
});
it("skips channels with health monitor disabled globally for that account", async () => {
  // One discord account that is enabled and configured but not running.
  const accounts = {
    discord: {
      default: { running: false, enabled: true, configured: true },
    },
  };
  // The manager reports the health monitor as disabled for every account.
  const managerOverrides = { isHealthMonitorEnabled: vi.fn(() => false) };
  const manager = createSnapshotManager(accounts, managerOverrides);
  await expectNoStart(manager);
});
it("still restarts enabled accounts when another account on the same channel is disabled", async () => {
  const now = Date.now();
  // Both accounts share the same status: running, not connected, started 300s ago.
  const stuckStatus = {
    running: true,
    connected: false,
    enabled: true,
    configured: true,
    lastStartAt: now - 300_000,
  };
  // Health monitoring is switched off only for the discord "quiet" account.
  const isHealthMonitorEnabled = vi.fn(
    (channelId: ChannelId, accountId: string) => channelId !== "discord" || accountId !== "quiet",
  );
  const manager = createSnapshotManager(
    {
      discord: {
        default: { ...stuckStatus },
        quiet: { ...stuckStatus },
      },
    },
    { isHealthMonitorEnabled },
  );

  const monitor = await startAndRunCheck(manager);

  // The monitored account is cycled (stop, then start)...
  expect(manager.stopChannel).toHaveBeenCalledWith("discord", "default");
  expect(manager.startChannel).toHaveBeenCalledWith("discord", "default");
  // ...while the opted-out account is left untouched.
  expect(manager.stopChannel).not.toHaveBeenCalledWith("discord", "quiet");
  expect(manager.startChannel).not.toHaveBeenCalledWith("discord", "quiet");
  monitor.stop();
});
it("restarts a stuck channel (running but not connected)", async () => {
const now = Date.now();
const manager = createSnapshotManager({

View File

@ -118,6 +118,9 @@ export function startChannelHealthMonitor(deps: ChannelHealthMonitorDeps): Chann
if (!status) {
continue;
}
if (!channelManager.isHealthMonitorEnabled(channelId as ChannelId, accountId)) {
continue;
}
if (channelManager.isManuallyStopped(channelId as ChannelId, accountId)) {
continue;
}

View File

@ -36,6 +36,11 @@ type ReloadAction =
const BASE_RELOAD_RULES: ReloadRule[] = [
{ prefix: "gateway.remote", kind: "none" },
{ prefix: "gateway.reload", kind: "none" },
{
prefix: "gateway.channelHealthMonitorEnabled",
kind: "hot",
actions: ["restart-health-monitor"],
},
{
prefix: "gateway.channelHealthCheckMinutes",
kind: "hot",

View File

@ -31,6 +31,16 @@ type ChannelRuntimeStore = {
runtimes: Map<string, ChannelAccountSnapshot>;
};
/** Minimal view of a config entry that may carry a `healthMonitor.enabled` override. */
type RawHealthMonitorEntry = {
healthMonitor?: {
enabled?: boolean;
};
};
/**
* Minimal view of a channel config: the channel's own override plus optional
* per-account entries keyed by account id. Individual entries may be undefined.
*/
type RawChannelConfig = RawHealthMonitorEntry & {
accounts?: Record<string, RawHealthMonitorEntry | undefined>;
};
function createRuntimeStore(): ChannelRuntimeStore {
return {
aborts: new Map(),
@ -105,6 +115,7 @@ export type ChannelManager = {
markChannelLoggedOut: (channelId: ChannelId, cleared: boolean, accountId?: string) => void;
isManuallyStopped: (channelId: ChannelId, accountId: string) => boolean;
resetRestartAttempts: (channelId: ChannelId, accountId: string) => void;
isHealthMonitorEnabled: (channelId: ChannelId, accountId: string) => boolean;
};
// Channel docking: lifecycle hooks (`plugin.gateway`) flow through this manager.
@ -119,6 +130,26 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
const restartKey = (channelId: ChannelId, accountId: string) => `${channelId}:${accountId}`;
/**
 * Decide whether the health monitor may act on one channel account.
 *
 * The config is re-read through loadConfig() on every call. Precedence:
 * 1. `gateway.channelHealthMonitorEnabled === false` disables every account.
 * 2. A boolean `accounts[accountId].healthMonitor.enabled` on the channel wins next.
 * 3. Otherwise a boolean channel-level `healthMonitor.enabled` applies.
 * 4. With no boolean override present the monitor is enabled.
 *
 * NOTE(review): the account lookup is an exact key match on `accountId`;
 * confirm callers pass ids spelled the same way as the config keys.
 *
 * @param channelId - Channel whose config section is consulted.
 * @param accountId - Key looked up under that channel's `accounts` map.
 * @returns true when health-monitor handling is allowed for the account.
 */
const isHealthMonitorEnabled = (channelId: ChannelId, accountId: string): boolean => {
  const config = loadConfig();
  const globallyDisabled = config.gateway?.channelHealthMonitorEnabled === false;
  if (globallyDisabled) {
    return false;
  }

  const channelEntry = config.channels?.[channelId] as RawChannelConfig | undefined;
  // Most specific first: the account-level value shadows the channel-level one.
  const candidates = [
    channelEntry?.accounts?.[accountId]?.healthMonitor?.enabled,
    channelEntry?.healthMonitor?.enabled,
  ];
  for (const candidate of candidates) {
    // Only real booleans count as overrides; anything else falls through.
    if (typeof candidate === "boolean") {
      return candidate;
    }
  }
  return true;
};
const getStore = (channelId: ChannelId): ChannelRuntimeStore => {
const existing = channelStores.get(channelId);
if (existing) {
@ -453,5 +484,6 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
markChannelLoggedOut,
isManuallyStopped: isManuallyStopped_,
resetRestartAttempts: resetRestartAttempts_,
isHealthMonitorEnabled,
};
}

View File

@ -104,10 +104,11 @@ export function createGatewayReloadHandlers(params: {
if (plan.restartHealthMonitor) {
state.channelHealthMonitor?.stop();
const enabled = nextConfig.gateway?.channelHealthMonitorEnabled !== false;
const minutes = nextConfig.gateway?.channelHealthCheckMinutes;
const staleMinutes = nextConfig.gateway?.channelStaleEventThresholdMinutes;
nextState.channelHealthMonitor =
minutes === 0
!enabled || minutes === 0
? null
: params.createHealthMonitor({
checkIntervalMs: (minutes ?? 5) * 60_000,

View File

@ -755,8 +755,9 @@ export async function startGatewayServer(
}
: startHeartbeatRunner({ cfg: cfgAtStart });
const healthMonitorEnabled = cfgAtStart.gateway?.channelHealthMonitorEnabled !== false;
const healthCheckMinutes = cfgAtStart.gateway?.channelHealthCheckMinutes;
const healthCheckDisabled = healthCheckMinutes === 0;
const healthCheckDisabled = !healthMonitorEnabled || healthCheckMinutes === 0;
const staleEventThresholdMinutes = cfgAtStart.gateway?.channelStaleEventThresholdMinutes;
const maxRestartsPerHour = cfgAtStart.gateway?.channelMaxRestartsPerHour;
let channelHealthMonitor = healthCheckDisabled

View File

@ -109,6 +109,9 @@ const hoisted = vi.hoisted(() => {
startChannel: vi.fn(async () => {}),
stopChannel: vi.fn(async () => {}),
markChannelLoggedOut: vi.fn(),
isHealthMonitorEnabled: vi.fn(() => true),
isManuallyStopped: vi.fn(() => false),
resetRestartAttempts: vi.fn(),
};
const createChannelManager = vi.fn(() => providerManager);

View File

@ -26,6 +26,7 @@ function createManager(snapshot: ChannelRuntimeSnapshot): ChannelManager {
startChannel: vi.fn(),
stopChannel: vi.fn(),
markChannelLoggedOut: vi.fn(),
isHealthMonitorEnabled: vi.fn(() => true),
isManuallyStopped: vi.fn(() => false),
resetRestartAttempts: vi.fn(),
};