mirror of https://github.com/openclaw/openclaw.git
feat(gateway): make health monitor stale threshold and max restarts configurable
This commit is contained in:
parent
4c6a7f84a4
commit
47c865b610
|
|
@ -102,6 +102,10 @@ export const FIELD_HELP: Record<string, string> = {
|
|||
"Explicit gateway-level tool denylist to block risky tools even if lower-level policies allow them. Use deny rules for emergency response and defense-in-depth hardening.",
|
||||
"gateway.channelHealthCheckMinutes":
|
||||
"Interval in minutes for automatic channel health probing and status updates. Use lower intervals for faster detection, or higher intervals to reduce periodic probe noise.",
|
||||
"gateway.channelStaleEventThresholdMinutes":
|
||||
"How many minutes a connected channel can go without receiving any event before the health monitor treats it as a stale socket and triggers a restart. Default: 30.",
|
||||
"gateway.channelMaxRestartsPerHour":
|
||||
"Maximum number of health-monitor-initiated channel restarts allowed within a rolling one-hour window. Once hit, further restarts are skipped until the window expires. Default: 10.",
|
||||
"gateway.tailscale":
|
||||
"Tailscale integration settings for Serve/Funnel exposure and lifecycle handling on gateway start/exit. Keep off unless your deployment intentionally relies on Tailscale ingress.",
|
||||
"gateway.tailscale.mode":
|
||||
|
|
|
|||
|
|
@ -84,6 +84,8 @@ export const FIELD_LABELS: Record<string, string> = {
|
|||
"gateway.tools.allow": "Gateway Tool Allowlist",
|
||||
"gateway.tools.deny": "Gateway Tool Denylist",
|
||||
"gateway.channelHealthCheckMinutes": "Gateway Channel Health Check Interval (min)",
|
||||
"gateway.channelStaleEventThresholdMinutes": "Gateway Channel Stale Event Threshold (min)",
|
||||
"gateway.channelMaxRestartsPerHour": "Gateway Channel Max Restarts Per Hour",
|
||||
"gateway.tailscale": "Gateway Tailscale",
|
||||
"gateway.tailscale.mode": "Gateway Tailscale Mode",
|
||||
"gateway.tailscale.resetOnExit": "Gateway Tailscale Reset on Exit",
|
||||
|
|
|
|||
|
|
@ -431,4 +431,16 @@ export type GatewayConfig = {
|
|||
* Set to 0 to disable. Default: 5.
|
||||
*/
|
||||
channelHealthCheckMinutes?: number;
|
||||
/**
|
||||
* Stale event threshold in minutes for the channel health monitor.
|
||||
* A connected channel that receives no events for this duration is treated
|
||||
* as a stale socket and restarted. Must be an integer >= 1. Default: 30.
|
||||
*/
|
||||
channelStaleEventThresholdMinutes?: number;
|
||||
/**
|
||||
* Maximum number of health-monitor-initiated channel restarts per hour.
|
||||
* Once this limit is reached, the monitor skips further restarts until
|
||||
* the rolling window expires. Must be an integer >= 1. Default: 10.
|
||||
*/
|
||||
channelMaxRestartsPerHour?: number;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -696,6 +696,8 @@ export const OpenClawSchema = z
|
|||
.strict()
|
||||
.optional(),
|
||||
channelHealthCheckMinutes: z.number().int().min(0).optional(),
|
||||
channelStaleEventThresholdMinutes: z.number().int().min(1).optional(),
|
||||
channelMaxRestartsPerHour: z.number().int().min(1).optional(),
|
||||
tailscale: z
|
||||
.object({
|
||||
mode: z.union([z.literal("off"), z.literal("serve"), z.literal("funnel")]).optional(),
|
||||
|
|
|
|||
|
|
@ -41,6 +41,16 @@ const BASE_RELOAD_RULES: ReloadRule[] = [
|
|||
kind: "hot",
|
||||
actions: ["restart-health-monitor"],
|
||||
},
|
||||
{
|
||||
prefix: "gateway.channelStaleEventThresholdMinutes",
|
||||
kind: "hot",
|
||||
actions: ["restart-health-monitor"],
|
||||
},
|
||||
{
|
||||
prefix: "gateway.channelMaxRestartsPerHour",
|
||||
kind: "hot",
|
||||
actions: ["restart-health-monitor"],
|
||||
},
|
||||
// Stuck-session warning threshold is read by the diagnostics heartbeat loop.
|
||||
{ prefix: "diagnostics.stuckSessionWarnMs", kind: "none" },
|
||||
{ prefix: "hooks.gmail", kind: "hot", actions: ["restart-gmail-watcher"] },
|
||||
|
|
|
|||
|
|
@ -50,7 +50,11 @@ export function createGatewayReloadHandlers(params: {
|
|||
logChannels: { info: (msg: string) => void; error: (msg: string) => void };
|
||||
logCron: { error: (msg: string) => void };
|
||||
logReload: { info: (msg: string) => void; warn: (msg: string) => void };
|
||||
createHealthMonitor: (checkIntervalMs: number) => ChannelHealthMonitor;
|
||||
createHealthMonitor: (opts: {
|
||||
checkIntervalMs: number;
|
||||
staleEventThresholdMs?: number;
|
||||
maxRestartsPerHour?: number;
|
||||
}) => ChannelHealthMonitor;
|
||||
}) {
|
||||
const applyHotReload = async (
|
||||
plan: GatewayReloadPlan,
|
||||
|
|
@ -101,8 +105,17 @@ export function createGatewayReloadHandlers(params: {
|
|||
if (plan.restartHealthMonitor) {
|
||||
state.channelHealthMonitor?.stop();
|
||||
const minutes = nextConfig.gateway?.channelHealthCheckMinutes;
|
||||
const staleMinutes = nextConfig.gateway?.channelStaleEventThresholdMinutes;
|
||||
nextState.channelHealthMonitor =
|
||||
minutes === 0 ? null : params.createHealthMonitor((minutes ?? 5) * 60_000);
|
||||
minutes === 0
|
||||
? null
|
||||
: params.createHealthMonitor({
|
||||
checkIntervalMs: (minutes ?? 5) * 60_000,
|
||||
...(staleMinutes != null && { staleEventThresholdMs: staleMinutes * 60_000 }),
|
||||
...(nextConfig.gateway?.channelMaxRestartsPerHour != null && {
|
||||
maxRestartsPerHour: nextConfig.gateway.channelMaxRestartsPerHour,
|
||||
}),
|
||||
});
|
||||
}
|
||||
|
||||
if (plan.restartGmailWatcher) {
|
||||
|
|
|
|||
|
|
@ -757,11 +757,17 @@ export async function startGatewayServer(
|
|||
|
||||
const healthCheckMinutes = cfgAtStart.gateway?.channelHealthCheckMinutes;
|
||||
const healthCheckDisabled = healthCheckMinutes === 0;
|
||||
const staleEventThresholdMinutes = cfgAtStart.gateway?.channelStaleEventThresholdMinutes;
|
||||
const maxRestartsPerHour = cfgAtStart.gateway?.channelMaxRestartsPerHour;
|
||||
let channelHealthMonitor = healthCheckDisabled
|
||||
? null
|
||||
: startChannelHealthMonitor({
|
||||
channelManager,
|
||||
checkIntervalMs: (healthCheckMinutes ?? 5) * 60_000,
|
||||
...(staleEventThresholdMinutes != null && {
|
||||
staleEventThresholdMs: staleEventThresholdMinutes * 60_000,
|
||||
}),
|
||||
...(maxRestartsPerHour != null && { maxRestartsPerHour }),
|
||||
});
|
||||
|
||||
if (!minimalTestGateway) {
|
||||
|
|
@ -980,8 +986,21 @@ export async function startGatewayServer(
|
|||
logChannels,
|
||||
logCron,
|
||||
logReload,
|
||||
createHealthMonitor: (checkIntervalMs: number) =>
|
||||
startChannelHealthMonitor({ channelManager, checkIntervalMs }),
|
||||
createHealthMonitor: (opts: {
|
||||
checkIntervalMs: number;
|
||||
staleEventThresholdMs?: number;
|
||||
maxRestartsPerHour?: number;
|
||||
}) =>
|
||||
startChannelHealthMonitor({
|
||||
channelManager,
|
||||
checkIntervalMs: opts.checkIntervalMs,
|
||||
...(opts.staleEventThresholdMs != null && {
|
||||
staleEventThresholdMs: opts.staleEventThresholdMs,
|
||||
}),
|
||||
...(opts.maxRestartsPerHour != null && {
|
||||
maxRestartsPerHour: opts.maxRestartsPerHour,
|
||||
}),
|
||||
}),
|
||||
});
|
||||
|
||||
return startGatewayConfigReloader({
|
||||
|
|
|
|||
Loading…
Reference in New Issue