feat(gateway): make health monitor stale threshold and max restarts configurable

This commit is contained in:
rstar327 2026-03-10 04:51:38 +02:00 committed by Tak Hoffman
parent 4c6a7f84a4
commit 47c865b610
7 changed files with 66 additions and 4 deletions

View File

@ -102,6 +102,10 @@ export const FIELD_HELP: Record<string, string> = {
"Explicit gateway-level tool denylist to block risky tools even if lower-level policies allow them. Use deny rules for emergency response and defense-in-depth hardening.",
"gateway.channelHealthCheckMinutes":
"Interval in minutes for automatic channel health probing and status updates. Use lower intervals for faster detection, or higher intervals to reduce periodic probe noise.",
"gateway.channelStaleEventThresholdMinutes":
"How many minutes a connected channel can go without receiving any event before the health monitor treats it as a stale socket and triggers a restart. Default: 30.",
"gateway.channelMaxRestartsPerHour":
"Maximum number of health-monitor-initiated channel restarts allowed within a rolling one-hour window. Once hit, further restarts are skipped until the window expires. Default: 10.",
"gateway.tailscale":
"Tailscale integration settings for Serve/Funnel exposure and lifecycle handling on gateway start/exit. Keep off unless your deployment intentionally relies on Tailscale ingress.",
"gateway.tailscale.mode":

View File

@ -84,6 +84,8 @@ export const FIELD_LABELS: Record<string, string> = {
"gateway.tools.allow": "Gateway Tool Allowlist",
"gateway.tools.deny": "Gateway Tool Denylist",
"gateway.channelHealthCheckMinutes": "Gateway Channel Health Check Interval (min)",
"gateway.channelStaleEventThresholdMinutes": "Gateway Channel Stale Event Threshold (min)",
"gateway.channelMaxRestartsPerHour": "Gateway Channel Max Restarts Per Hour",
"gateway.tailscale": "Gateway Tailscale",
"gateway.tailscale.mode": "Gateway Tailscale Mode",
"gateway.tailscale.resetOnExit": "Gateway Tailscale Reset on Exit",

View File

@ -431,4 +431,16 @@ export type GatewayConfig = {
* Set to 0 to disable. Default: 5.
*/
channelHealthCheckMinutes?: number;
/**
* Stale event threshold in minutes for the channel health monitor.
* A connected channel that receives no events for this duration is treated
* as a stale socket and restarted. Default: 30.
*
* The config schema only accepts integers >= 1, so unlike
* `channelHealthCheckMinutes` a value of 0 is rejected rather than meaning
* "disabled". The gateway multiplies the value by 60_000 and hands it to the
* health monitor as `staleEventThresholdMs`; when unset, no override is
* passed and the monitor falls back to its own default.
*/
channelStaleEventThresholdMinutes?: number;
/**
* Maximum number of health-monitor-initiated channel restarts per hour.
* Once this limit is reached, the monitor skips further restarts until
* the rolling window expires. Default: 10.
*
* The config schema only accepts integers >= 1, so 0 is rejected. The value
* is passed unchanged to the health monitor as `maxRestartsPerHour`; when
* unset, the option is omitted and the monitor falls back to its own default.
*/
channelMaxRestartsPerHour?: number;
};

View File

@ -696,6 +696,8 @@ export const OpenClawSchema = z
.strict()
.optional(),
channelHealthCheckMinutes: z.number().int().min(0).optional(),
channelStaleEventThresholdMinutes: z.number().int().min(1).optional(),
channelMaxRestartsPerHour: z.number().int().min(1).optional(),
tailscale: z
.object({
mode: z.union([z.literal("off"), z.literal("serve"), z.literal("funnel")]).optional(),

View File

@ -41,6 +41,16 @@ const BASE_RELOAD_RULES: ReloadRule[] = [
kind: "hot",
actions: ["restart-health-monitor"],
},
// The stale-event threshold is handed to the channel health monitor when the
// monitor is created, so a change is applied as a hot reload through the
// restart-health-monitor action.
{
prefix: "gateway.channelStaleEventThresholdMinutes",
kind: "hot",
actions: ["restart-health-monitor"],
},
// The restart cap is likewise passed to the channel health monitor at
// creation time, so a change is applied as a hot reload through the
// restart-health-monitor action.
{
prefix: "gateway.channelMaxRestartsPerHour",
kind: "hot",
actions: ["restart-health-monitor"],
},
// Stuck-session warning threshold is read by the diagnostics heartbeat loop.
{ prefix: "diagnostics.stuckSessionWarnMs", kind: "none" },
{ prefix: "hooks.gmail", kind: "hot", actions: ["restart-gmail-watcher"] },

View File

@ -50,7 +50,11 @@ export function createGatewayReloadHandlers(params: {
logChannels: { info: (msg: string) => void; error: (msg: string) => void };
logCron: { error: (msg: string) => void };
logReload: { info: (msg: string) => void; warn: (msg: string) => void };
createHealthMonitor: (checkIntervalMs: number) => ChannelHealthMonitor;
createHealthMonitor: (opts: {
checkIntervalMs: number;
staleEventThresholdMs?: number;
maxRestartsPerHour?: number;
}) => ChannelHealthMonitor;
}) {
const applyHotReload = async (
plan: GatewayReloadPlan,
@ -101,8 +105,17 @@ export function createGatewayReloadHandlers(params: {
if (plan.restartHealthMonitor) {
state.channelHealthMonitor?.stop();
const minutes = nextConfig.gateway?.channelHealthCheckMinutes;
const staleMinutes = nextConfig.gateway?.channelStaleEventThresholdMinutes;
nextState.channelHealthMonitor =
minutes === 0 ? null : params.createHealthMonitor((minutes ?? 5) * 60_000);
minutes === 0
? null
: params.createHealthMonitor({
checkIntervalMs: (minutes ?? 5) * 60_000,
...(staleMinutes != null && { staleEventThresholdMs: staleMinutes * 60_000 }),
...(nextConfig.gateway?.channelMaxRestartsPerHour != null && {
maxRestartsPerHour: nextConfig.gateway.channelMaxRestartsPerHour,
}),
});
}
if (plan.restartGmailWatcher) {

View File

@ -757,11 +757,17 @@ export async function startGatewayServer(
const healthCheckMinutes = cfgAtStart.gateway?.channelHealthCheckMinutes;
const healthCheckDisabled = healthCheckMinutes === 0;
const staleEventThresholdMinutes = cfgAtStart.gateway?.channelStaleEventThresholdMinutes;
const maxRestartsPerHour = cfgAtStart.gateway?.channelMaxRestartsPerHour;
let channelHealthMonitor = healthCheckDisabled
? null
: startChannelHealthMonitor({
channelManager,
checkIntervalMs: (healthCheckMinutes ?? 5) * 60_000,
...(staleEventThresholdMinutes != null && {
staleEventThresholdMs: staleEventThresholdMinutes * 60_000,
}),
...(maxRestartsPerHour != null && { maxRestartsPerHour }),
});
if (!minimalTestGateway) {
@ -980,8 +986,21 @@ export async function startGatewayServer(
logChannels,
logCron,
logReload,
createHealthMonitor: (checkIntervalMs: number) =>
startChannelHealthMonitor({ channelManager, checkIntervalMs }),
createHealthMonitor: (opts: {
checkIntervalMs: number;
staleEventThresholdMs?: number;
maxRestartsPerHour?: number;
}) =>
startChannelHealthMonitor({
channelManager,
checkIntervalMs: opts.checkIntervalMs,
...(opts.staleEventThresholdMs != null && {
staleEventThresholdMs: opts.staleEventThresholdMs,
}),
...(opts.maxRestartsPerHour != null && {
maxRestartsPerHour: opts.maxRestartsPerHour,
}),
}),
});
return startGatewayConfigReloader({