diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 555ee02b8eb..7fbfdec76d8 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -102,6 +102,10 @@ export const FIELD_HELP: Record = { "Explicit gateway-level tool denylist to block risky tools even if lower-level policies allow them. Use deny rules for emergency response and defense-in-depth hardening.", "gateway.channelHealthCheckMinutes": "Interval in minutes for automatic channel health probing and status updates. Use lower intervals for faster detection, or higher intervals to reduce periodic probe noise.", + "gateway.channelStaleEventThresholdMinutes": + "How many minutes a connected channel can go without receiving any event before the health monitor treats it as a stale socket and triggers a restart. Default: 30.", + "gateway.channelMaxRestartsPerHour": + "Maximum number of health-monitor-initiated channel restarts allowed within a rolling one-hour window. Once hit, further restarts are skipped until the window expires. Default: 10.", "gateway.tailscale": "Tailscale integration settings for Serve/Funnel exposure and lifecycle handling on gateway start/exit. 
Keep off unless your deployment intentionally relies on Tailscale ingress.", "gateway.tailscale.mode": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index 9b1fdb73445..e700f2329b4 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -84,6 +84,8 @@ export const FIELD_LABELS: Record = { "gateway.tools.allow": "Gateway Tool Allowlist", "gateway.tools.deny": "Gateway Tool Denylist", "gateway.channelHealthCheckMinutes": "Gateway Channel Health Check Interval (min)", + "gateway.channelStaleEventThresholdMinutes": "Gateway Channel Stale Event Threshold (min)", + "gateway.channelMaxRestartsPerHour": "Gateway Channel Max Restarts Per Hour", "gateway.tailscale": "Gateway Tailscale", "gateway.tailscale.mode": "Gateway Tailscale Mode", "gateway.tailscale.resetOnExit": "Gateway Tailscale Reset on Exit", diff --git a/src/config/types.gateway.ts b/src/config/types.gateway.ts index ea17a1d9d05..88a5350ab1d 100644 --- a/src/config/types.gateway.ts +++ b/src/config/types.gateway.ts @@ -431,4 +431,16 @@ export type GatewayConfig = { * Set to 0 to disable. Default: 5. */ channelHealthCheckMinutes?: number; + /** + * Stale event threshold in minutes for the channel health monitor. + * A connected channel that receives no events for this duration is treated + * as a stale socket and restarted. Must be an integer of at least 1. Default: 30. + */ + channelStaleEventThresholdMinutes?: number; + /** + * Maximum number of health-monitor-initiated channel restarts per hour. + * Once this limit is reached, the monitor skips further restarts until + * the rolling window expires. Must be an integer of at least 1. Default: 10. 
+ */ + channelMaxRestartsPerHour?: number; }; diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index 8c78d049d0e..dac3e61f94c 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -696,6 +696,8 @@ export const OpenClawSchema = z .strict() .optional(), channelHealthCheckMinutes: z.number().int().min(0).optional(), + channelStaleEventThresholdMinutes: z.number().int().min(1).optional(), + channelMaxRestartsPerHour: z.number().int().min(1).optional(), tailscale: z .object({ mode: z.union([z.literal("off"), z.literal("serve"), z.literal("funnel")]).optional(), diff --git a/src/gateway/config-reload-plan.ts b/src/gateway/config-reload-plan.ts index 4ca1fcea7f0..63eddd31c54 100644 --- a/src/gateway/config-reload-plan.ts +++ b/src/gateway/config-reload-plan.ts @@ -41,6 +41,16 @@ const BASE_RELOAD_RULES: ReloadRule[] = [ kind: "hot", actions: ["restart-health-monitor"], }, + { + prefix: "gateway.channelStaleEventThresholdMinutes", + kind: "hot", + actions: ["restart-health-monitor"], + }, + { + prefix: "gateway.channelMaxRestartsPerHour", + kind: "hot", + actions: ["restart-health-monitor"], + }, // Stuck-session warning threshold is read by the diagnostics heartbeat loop. 
{ prefix: "diagnostics.stuckSessionWarnMs", kind: "none" }, { prefix: "hooks.gmail", kind: "hot", actions: ["restart-gmail-watcher"] }, diff --git a/src/gateway/server-reload-handlers.ts b/src/gateway/server-reload-handlers.ts index f9cfb9111fe..008f0977d37 100644 --- a/src/gateway/server-reload-handlers.ts +++ b/src/gateway/server-reload-handlers.ts @@ -50,7 +50,11 @@ export function createGatewayReloadHandlers(params: { logChannels: { info: (msg: string) => void; error: (msg: string) => void }; logCron: { error: (msg: string) => void }; logReload: { info: (msg: string) => void; warn: (msg: string) => void }; - createHealthMonitor: (checkIntervalMs: number) => ChannelHealthMonitor; + createHealthMonitor: (opts: { + checkIntervalMs: number; + staleEventThresholdMs?: number; + maxRestartsPerHour?: number; + }) => ChannelHealthMonitor; }) { const applyHotReload = async ( plan: GatewayReloadPlan, @@ -101,8 +105,17 @@ export function createGatewayReloadHandlers(params: { if (plan.restartHealthMonitor) { state.channelHealthMonitor?.stop(); const minutes = nextConfig.gateway?.channelHealthCheckMinutes; + const staleMinutes = nextConfig.gateway?.channelStaleEventThresholdMinutes; nextState.channelHealthMonitor = - minutes === 0 ? null : params.createHealthMonitor((minutes ?? 5) * 60_000); + minutes === 0 + ? null + : params.createHealthMonitor({ + checkIntervalMs: (minutes ?? 
5) * 60_000, + ...(staleMinutes != null && { staleEventThresholdMs: staleMinutes * 60_000 }), + ...(nextConfig.gateway?.channelMaxRestartsPerHour != null && { + maxRestartsPerHour: nextConfig.gateway.channelMaxRestartsPerHour, + }), + }); } if (plan.restartGmailWatcher) { diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index 9b3941d1432..5453ff8fcee 100644 --- a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -757,11 +757,17 @@ export async function startGatewayServer( const healthCheckMinutes = cfgAtStart.gateway?.channelHealthCheckMinutes; const healthCheckDisabled = healthCheckMinutes === 0; + const staleEventThresholdMinutes = cfgAtStart.gateway?.channelStaleEventThresholdMinutes; + const maxRestartsPerHour = cfgAtStart.gateway?.channelMaxRestartsPerHour; let channelHealthMonitor = healthCheckDisabled ? null : startChannelHealthMonitor({ channelManager, checkIntervalMs: (healthCheckMinutes ?? 5) * 60_000, + ...(staleEventThresholdMinutes != null && { + staleEventThresholdMs: staleEventThresholdMinutes * 60_000, + }), + ...(maxRestartsPerHour != null && { maxRestartsPerHour }), }); if (!minimalTestGateway) { @@ -980,8 +986,21 @@ export async function startGatewayServer( logChannels, logCron, logReload, - createHealthMonitor: (checkIntervalMs: number) => - startChannelHealthMonitor({ channelManager, checkIntervalMs }), + createHealthMonitor: (opts: { + checkIntervalMs: number; + staleEventThresholdMs?: number; + maxRestartsPerHour?: number; + }) => + startChannelHealthMonitor({ + channelManager, + checkIntervalMs: opts.checkIntervalMs, + ...(opts.staleEventThresholdMs != null && { + staleEventThresholdMs: opts.staleEventThresholdMs, + }), + ...(opts.maxRestartsPerHour != null && { + maxRestartsPerHour: opts.maxRestartsPerHour, + }), + }), }); return startGatewayConfigReloader({