fix(memory): stagger qmd embed maintenance across agents (#58180)

* fix(memory): stagger qmd embed maintenance across agents

* fix(memory): keep qmd embed serialization in-process

* fix(memory): extend qmd embed lock wait budget
This commit is contained in:
Vincent Koc 2026-03-31 17:17:20 +09:00 committed by GitHub
parent bf6f506dfa
commit d2dcd6fca6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 236 additions and 22 deletions

View File

@ -59,6 +59,7 @@ Docs: https://docs.openclaw.ai
- macOS/local gateway: stop OpenClaw.app from killing healthy local gateway listeners after startup by recognizing the current `openclaw-gateway` process title and using the current `openclaw gateway` launch shape.
- Gateway/OpenAI compatibility: accept flat Responses API function tool definitions on `/v1/responses` and preserve `strict` when normalizing hosted tools into the embedded runner, so spec-compliant clients like Codex no longer fail validation or silently lose strict tool enforcement. Thanks @malaiwah and @vincentkoc.
- Memory/QMD: resolve slugified `memory_search` file hints back to the indexed filesystem path before returning search hits, so `memory_get` works again for mixed-case and spaced paths. (#50313) Thanks @erra9x.
- Memory/QMD: serialize cross-process `qmd embed` runs behind a shared lock and stagger periodic embed timers so multi-agent QMD collections stop thundering-herding on startup and every maintenance interval. Thanks @vincentkoc.
- OpenAI/Codex fast mode: map `/fast` to priority processing on native OpenAI and Codex Responses endpoints instead of rewriting reasoning settings, and document the exact endpoint and override behavior.
- Memory/QMD: weight CJK-heavy text correctly when estimating chunk sizes, preserve surrogate-pair characters during fine splits, and keep long Latin lines on the old chunk boundaries so memory indexing produces better-sized chunks for CJK notes. (#40271) Thanks @AaronLuo00.
- Security/LINE: make webhook signature validation run the timing-safe compare even when the supplied signature length is wrong, closing a small timing side-channel. (#55663) Thanks @gavyngong.

View File

@ -20,7 +20,13 @@ const { watchMock } = vi.hoisted(() => ({
});
}),
}));
const { withFileLockMock } = vi.hoisted(() => ({
withFileLockMock: vi.fn(
async <T>(_filePath: string, _options: unknown, fn: () => Promise<T>) => await fn(),
),
}));
const MCPORTER_STATE_KEY = Symbol.for("openclaw.mcporterState");
const QMD_EMBED_QUEUE_KEY = Symbol.for("openclaw.qmdEmbedQueueTail");
type MockChild = EventEmitter & {
stdout: EventEmitter;
@ -106,6 +112,14 @@ vi.mock("chokidar", () => ({
watch: watchMock,
}));
vi.mock("openclaw/plugin-sdk/file-lock", async (importOriginal) => {
const actual = await importOriginal<typeof import("openclaw/plugin-sdk/file-lock")>();
return {
...actual,
withFileLock: withFileLockMock,
};
});
import { spawn as mockedSpawn } from "node:child_process";
import type { OpenClawConfig } from "openclaw/plugin-sdk/memory-core-host-engine-foundation";
import {
@ -128,6 +142,7 @@ describe("QmdMemoryManager", () => {
let cfg: OpenClawConfig;
const agentId = "main";
const openManagers = new Set<QmdMemoryManager>();
let embedStartupJitterSpy: ReturnType<typeof vi.spyOn> | null = null;
function trackManager<T extends QmdMemoryManager | null>(manager: T): T {
if (manager) {
@ -166,6 +181,7 @@ describe("QmdMemoryManager", () => {
spawnMock.mockClear();
spawnMock.mockImplementation(() => createMockChild());
watchMock.mockClear();
withFileLockMock.mockClear();
logWarnMock.mockClear();
logDebugMock.mockClear();
logInfoMock.mockClear();
@ -202,6 +218,14 @@ describe("QmdMemoryManager", () => {
},
},
} as OpenClawConfig;
embedStartupJitterSpy = vi
.spyOn(
QmdMemoryManager.prototype as unknown as {
resolveEmbedStartupJitterMs: () => number;
},
"resolveEmbedStartupJitterMs",
)
.mockReturnValue(0);
});
afterEach(async () => {
@ -212,6 +236,8 @@ describe("QmdMemoryManager", () => {
);
openManagers.clear();
await fs.rm(tmpRoot, { recursive: true, force: true });
embedStartupJitterSpy?.mockRestore();
embedStartupJitterSpy = null;
vi.useRealTimers();
delete process.env.OPENCLAW_STATE_DIR;
if (originalPath === undefined) {
@ -230,6 +256,7 @@ describe("QmdMemoryManager", () => {
(process.env as NodeJS.ProcessEnv & { Path?: string }).Path = originalWindowsPath;
}
delete (globalThis as Record<PropertyKey, unknown>)[MCPORTER_STATE_KEY];
delete (globalThis as Record<PropertyKey, unknown>)[QMD_EMBED_QUEUE_KEY];
});
it("debounces back-to-back sync calls", async () => {
@ -3112,6 +3139,102 @@ describe("QmdMemoryManager", () => {
await manager.close();
});
it("delays the first periodic embed maintenance run by stable startup jitter", async () => {
vi.useFakeTimers();
embedStartupJitterSpy?.mockRestore();
embedStartupJitterSpy = vi
.spyOn(
QmdMemoryManager.prototype as unknown as {
resolveEmbedStartupJitterMs: () => number;
},
"resolveEmbedStartupJitterMs",
)
.mockReturnValue(60_000);
cfg = {
...cfg,
memory: {
backend: "qmd",
qmd: {
includeDefaultMemory: false,
searchMode: "query",
update: {
interval: "0s",
debounceMs: 0,
onBoot: false,
embedInterval: "5m",
},
paths: [{ path: workspaceDir, pattern: "**/*.md", name: "workspace" }],
},
},
} as OpenClawConfig;
const { manager } = await createManager({ mode: "full" });
await vi.advanceTimersByTimeAsync(59_999);
const beforeCalls = spawnMock.mock.calls
.map((call: unknown[]) => call[1] as string[])
.filter((args: string[]) => args[0] === "update" || args[0] === "embed");
expect(beforeCalls).toHaveLength(0);
await vi.advanceTimersByTimeAsync(1);
const commandCalls = spawnMock.mock.calls
.map((call: unknown[]) => call[1] as string[])
.filter((args: string[]) => args[0] === "update" || args[0] === "embed");
expect(commandCalls).toEqual([["update"], ["embed"]]);
await manager.close();
});
it("serializes qmd embeds within a process before taking the shared file lock", async () => {
vi.useFakeTimers();
const embedChildren: MockChild[] = [];
spawnMock.mockImplementation((_cmd: string, args: string[]) => {
if (args[0] === "embed") {
const child = createMockChild({ autoClose: false });
embedChildren.push(child);
return child;
}
return createMockChild();
});
const first = await createManager({ mode: "status" });
const second = await createManager({ mode: "status" });
const firstSync = first.manager.sync({ reason: "manual", force: true });
await vi.advanceTimersByTimeAsync(0);
expect(embedChildren).toHaveLength(1);
expect(withFileLockMock).toHaveBeenCalledWith(
expect.any(String),
expect.objectContaining({
retries: expect.objectContaining({
retries: expect.any(Number),
maxTimeout: 10_000,
}),
stale: expect.any(Number),
}),
expect.any(Function),
);
const lockOptions = withFileLockMock.mock.calls[0]?.[1] as {
retries: { retries: number };
stale: number;
};
expect(lockOptions.retries.retries).toBeGreaterThanOrEqual(90);
expect(lockOptions.stale).toBeGreaterThanOrEqual(15 * 60 * 1000);
const secondSync = second.manager.sync({ reason: "manual", force: true });
await vi.advanceTimersByTimeAsync(0);
expect(embedChildren).toHaveLength(1);
embedChildren[0]?.closeWith(0);
await vi.advanceTimersByTimeAsync(0);
expect(embedChildren).toHaveLength(2);
embedChildren[1]?.closeWith(0);
await expect(firstSync).resolves.toBeUndefined();
await expect(secondSync).resolves.toBeUndefined();
await first.manager.close();
await second.manager.close();
});
it("runs qmd embed in search mode for forced sync", async () => {
cfg = {
...cfg,

View File

@ -1,8 +1,10 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import readline from "node:readline";
import chokidar, { type FSWatcher } from "chokidar";
import { withFileLock } from "openclaw/plugin-sdk/file-lock";
import {
createSubsystemLogger,
resolveMemorySearchConfig,
@ -52,7 +54,15 @@ const NUL_MARKER_RE = /(?:\^@|\\0|\\x00|\\u0000|null\s*byte|nul\s*byte)/i;
const QMD_EMBED_BACKOFF_BASE_MS = 60_000;
const QMD_EMBED_BACKOFF_MAX_MS = 60 * 60 * 1000;
const HAN_SCRIPT_RE = /[\u3400-\u9fff]/u;
const QMD_EMBED_LOCK_MIN_WAIT_MS = 15 * 60 * 1000;
const QMD_EMBED_LOCK_RETRY_TEMPLATE = {
factor: 1.2,
minTimeout: 250,
maxTimeout: 10_000,
randomize: true,
} as const;
const MCPORTER_STATE_KEY = Symbol.for("openclaw.mcporterState");
const QMD_EMBED_QUEUE_KEY = Symbol.for("openclaw.qmdEmbedQueueTail");
const IGNORED_MEMORY_WATCH_DIR_NAMES = new Set([
".git",
"node_modules",
@ -68,7 +78,9 @@ type McporterState = {
daemonStart: Promise<void> | null;
};
let qmdEmbedQueueTail: Promise<void> = Promise.resolve();
type QmdEmbedQueueState = {
tail: Promise<void>;
};
function getMcporterState(): McporterState {
return resolveGlobalSingleton<McporterState>(MCPORTER_STATE_KEY, () => ({
@ -77,6 +89,12 @@ function getMcporterState(): McporterState {
}));
}
function getQmdEmbedQueueState(): QmdEmbedQueueState {
return resolveGlobalSingleton<QmdEmbedQueueState>(QMD_EMBED_QUEUE_KEY, () => ({
tail: Promise.resolve(),
}));
}
function hasHanScript(value: string): boolean {
return HAN_SCRIPT_RE.test(value);
}
@ -87,26 +105,33 @@ function normalizeHanBm25Query(query: string): string {
return trimmed;
}
function resolveStableJitterMs(params: { seed: string; windowMs: number }): number {
if (params.windowMs <= 0) {
return 0;
}
const hash = crypto.createHash("sha256").update(params.seed).digest();
const bucket = hash.readUInt32BE(0);
return bucket % (Math.floor(params.windowMs) + 1);
}
function resolveQmdEmbedLockOptions(embedTimeoutMs: number) {
const expectedEmbedMs = Math.max(1, embedTimeoutMs);
const waitBudgetMs = Math.max(QMD_EMBED_LOCK_MIN_WAIT_MS, expectedEmbedMs * 6);
return {
retries: {
retries: Math.max(60, Math.ceil(waitBudgetMs / QMD_EMBED_LOCK_RETRY_TEMPLATE.maxTimeout)),
...QMD_EMBED_LOCK_RETRY_TEMPLATE,
},
stale: Math.max(QMD_EMBED_LOCK_MIN_WAIT_MS, expectedEmbedMs * 2),
};
}
function shouldIgnoreMemoryWatchPath(watchPath: string): boolean {
const normalized = path.normalize(watchPath);
const parts = normalized.split(path.sep).map((segment) => segment.trim().toLowerCase());
return parts.some((segment) => IGNORED_MEMORY_WATCH_DIR_NAMES.has(segment));
}
async function runWithQmdEmbedLock<T>(task: () => Promise<T>): Promise<T> {
const previous = qmdEmbedQueueTail;
let release: (() => void) | undefined;
qmdEmbedQueueTail = new Promise<void>((resolve) => {
release = resolve;
});
await previous.catch(() => undefined);
try {
return await task();
} finally {
release?.();
}
}
type CollectionRoot = {
path: string;
kind: MemorySource;
@ -337,11 +362,33 @@ export class QmdMemoryManager implements MemorySearchManager {
}, this.qmd.update.intervalMs);
}
if (this.shouldScheduleEmbedTimer()) {
this.embedTimer = setInterval(() => {
void this.runUpdate("embed-interval").catch((err) => {
log.warn(`qmd embed interval update failed (${String(err)})`);
});
}, this.qmd.update.embedIntervalMs);
const startPeriodicEmbedTimer = () => {
this.embedTimer = setInterval(() => {
void this.runUpdate("embed-interval").catch((err) => {
log.warn(`qmd embed interval update failed (${String(err)})`);
});
}, this.qmd.update.embedIntervalMs);
};
const initialDelayMs = this.resolveEmbedStartupJitterMs();
if (initialDelayMs > 0) {
this.embedTimer = setTimeout(() => {
this.embedTimer = null;
if (this.closed) {
return;
}
void this.runUpdate("embed-interval")
.catch((err) => {
log.warn(`qmd embed interval update failed (${String(err)})`);
})
.finally(() => {
if (!this.closed) {
startPeriodicEmbedTimer();
}
});
}, initialDelayMs);
} else {
startPeriodicEmbedTimer();
}
}
}
@ -1085,7 +1132,7 @@ export class QmdMemoryManager implements MemorySearchManager {
this.updateTimer = null;
}
if (this.embedTimer) {
clearInterval(this.embedTimer);
clearTimeout(this.embedTimer);
this.embedTimer = null;
}
if (this.watchTimer) {
@ -1136,7 +1183,7 @@ export class QmdMemoryManager implements MemorySearchManager {
this.dirty = false;
if (this.shouldRunEmbed(force)) {
try {
await runWithQmdEmbedLock(async () => {
await this.withQmdEmbedLock(async () => {
await this.runQmd(["embed"], {
timeoutMs: this.qmd.update.embedTimeoutMs,
discardOutput: true,
@ -1303,6 +1350,49 @@ export class QmdMemoryManager implements MemorySearchManager {
return updateIntervalMs <= 0 || updateIntervalMs > embedIntervalMs;
}
private resolveEmbedStartupJitterMs(): number {
const windowMs = this.qmd.update.embedIntervalMs;
if (windowMs <= 0) {
return 0;
}
const customCollections = this.qmd.collections
.filter((collection) => collection.kind === "custom")
.map((collection) => `${collection.path}\u0000${collection.pattern}`)
.toSorted()
.join("\u0001");
if (!customCollections) {
return 0;
}
return resolveStableJitterMs({
seed: `${this.agentId}:${customCollections}`,
windowMs,
});
}
private async withQmdEmbedLock<T>(task: () => Promise<T>): Promise<T> {
const lockPath = path.join(this.stateDir, "qmd", "embed.lock");
const queue = getQmdEmbedQueueState();
const previous = queue.tail;
let releaseCurrent!: () => void;
const current = new Promise<void>((resolve) => {
releaseCurrent = resolve;
});
queue.tail = previous.then(
() => current,
() => current,
);
await previous.catch(() => undefined);
try {
return await withFileLock(
lockPath,
resolveQmdEmbedLockOptions(this.qmd.update.embedTimeoutMs),
task,
);
} finally {
releaseCurrent();
}
}
private noteEmbedFailure(reason: string, err: unknown): void {
this.embedFailureCount += 1;
const delayMs = Math.min(