fix(infra): classify SQLite transient errors as non-fatal

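SQLITE_CANTOPEN, SQLITE_BUSY, SQLITE_LOCKED and SQLITE_IOERR_* errors were
not in any safe-list and caused process.exit(1), killing LaunchAgent gateways.
Add transient SQLite classification alongside existing network errors. Only
specific transient SQLITE_IOERR subtypes are treated as non-fatal; the broad
SQLITE_IOERR base code is intentionally still fatal.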

Closes #34678

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Br1an67 2026-03-15 01:50:12 +08:00
parent ba6064cc22
commit 0e86bf00a4
2 changed files with 128 additions and 14 deletions

View File

@ -1,5 +1,9 @@
import { describe, expect, it } from "vitest"; import { describe, expect, it } from "vitest";
import { isAbortError, isTransientNetworkError } from "./unhandled-rejections.js"; import {
isAbortError,
isTransientNetworkError,
isTransientSqliteError,
} from "./unhandled-rejections.js";
describe("isAbortError", () => { describe("isAbortError", () => {
it("returns true for error with name AbortError", () => { it("returns true for error with name AbortError", () => {
@ -187,3 +191,59 @@ describe("isTransientNetworkError", () => {
expect(isTransientNetworkError(error)).toBe(false); expect(isTransientNetworkError(error)).toBe(false);
}); });
}); });
// Test suite for isTransientSqliteError: transient codes are accepted (directly or via
// a `cause`), while the broad SQLITE_IOERR code, permanent codes, non-SQLite errors and
// non-error inputs are rejected.
describe("isTransientSqliteError", () => {
  // Builds an Error with message "test" that carries the given `code` property.
  const makeCodedError = (code: string) => Object.assign(new Error("test"), { code });

  it("returns true for errors with transient SQLite codes", () => {
    const transientCodes = [
      "SQLITE_CANTOPEN",
      "SQLITE_BUSY",
      "SQLITE_LOCKED",
      "SQLITE_IOERR_LOCK",
      "SQLITE_IOERR_SHORT_READ",
      "SQLITE_IOERR_BLOCKED",
    ];
    transientCodes.forEach((code) => {
      // The second argument to expect() labels a failure with the offending code.
      expect(isTransientSqliteError(makeCodedError(code)), `code: ${code}`).toBe(true);
    });
  });

  it("returns false for broad SQLITE_IOERR base code", () => {
    const baseIoError = makeCodedError("SQLITE_IOERR");
    expect(isTransientSqliteError(baseIoError)).toBe(false);
  });

  it("returns false for permanent SQLITE_IOERR subtypes", () => {
    ["SQLITE_IOERR_NOMEM", "SQLITE_IOERR_ACCESS", "SQLITE_IOERR_WRITE"].forEach((code) => {
      expect(isTransientSqliteError(makeCodedError(code)), `code: ${code}`).toBe(false);
    });
  });

  it("returns true for SQLite error nested in cause chain", () => {
    // Only the inner error carries the SQLite code; the wrapper has none.
    const wrapped = Object.assign(new Error("wrapper"), {
      cause: Object.assign(new Error("database is locked"), { code: "SQLITE_BUSY" }),
    });
    expect(isTransientSqliteError(wrapped)).toBe(true);
  });

  it("returns false for non-SQLite errors", () => {
    const plainError = new Error("Something went wrong");
    const networkError = Object.assign(new Error("test"), { code: "ECONNRESET" });
    expect(isTransientSqliteError(plainError)).toBe(false);
    expect(isTransientSqliteError(networkError)).toBe(false);
  });

  it("returns false for non-transient SQLite errors", () => {
    expect(isTransientSqliteError(makeCodedError("SQLITE_CORRUPT"))).toBe(false);
  });

  it.each([null, undefined, "string error", 42, { message: "plain object" }])(
    "returns false for non-SQLite input %#",
    (input) => {
      expect(isTransientSqliteError(input)).toBe(false);
    },
  );
});

View File

@ -20,6 +20,19 @@ const FATAL_ERROR_CODES = new Set([
const CONFIG_ERROR_CODES = new Set(["INVALID_CONFIG", "MISSING_API_KEY", "MISSING_CREDENTIALS"]); const CONFIG_ERROR_CODES = new Set(["INVALID_CONFIG", "MISSING_API_KEY", "MISSING_CREDENTIALS"]);
/**
 * SQLite result-code names treated as transient, i.e. failures that should not
 * crash the gateway.
 *
 * The broad `SQLITE_IOERR` base code is deliberately absent: many of its subtypes
 * (e.g. `SQLITE_IOERR_NOMEM`, `SQLITE_IOERR_ACCESS`) are permanent, so only the
 * specific transient IO-error subtypes are listed.
 */
const TRANSIENT_SQLITE_CODES = new Set<string>([
  // Open / lock failures
  "SQLITE_CANTOPEN",
  "SQLITE_BUSY",
  "SQLITE_LOCKED",
  // Transient IO-error subtypes only (never the bare SQLITE_IOERR)
  "SQLITE_IOERR_LOCK",
  "SQLITE_IOERR_SHORT_READ",
  "SQLITE_IOERR_BLOCKED",
]);
// Network error codes that indicate transient failures (shouldn't crash the gateway) // Network error codes that indicate transient failures (shouldn't crash the gateway)
const TRANSIENT_NETWORK_CODES = new Set([ const TRANSIENT_NETWORK_CODES = new Set([
"ECONNRESET", "ECONNRESET",
@ -112,6 +125,21 @@ function extractErrorCodeWithCause(err: unknown): string | undefined {
return extractErrorCode(getErrorCause(err)); return extractErrorCode(getErrorCause(err));
} }
/**
 * Shared callback for {@link collectErrorGraphCandidates}, used by both the SQLite and
 * the network transient-error checks.
 *
 * @param current - The error-like record currently being inspected.
 * @returns The values of `cause`, `reason`, `original`, `error` and `data` (in that
 *   order, including `undefined` for absent fields), followed by every element of
 *   `errors` when that property is an array.
 */
function collectNestedErrorSources(current: Record<string, unknown>): Array<unknown> {
  const { cause, reason, original, error, data, errors } = current;
  // A non-array `errors` value contributes nothing.
  const aggregated: Array<unknown> = Array.isArray(errors) ? errors : [];
  return [cause, reason, original, error, data, ...aggregated];
}
/** /**
* Checks if an error is an AbortError. * Checks if an error is an AbortError.
* These are typically intentional cancellations (e.g., during shutdown) and shouldn't crash. * These are typically intentional cancellations (e.g., during shutdown) and shouldn't crash.
@ -142,6 +170,39 @@ function isConfigError(err: unknown): boolean {
return code !== undefined && CONFIG_ERROR_CODES.has(code); return code !== undefined && CONFIG_ERROR_CODES.has(code);
} }
/**
 * Checks if an error is a transient SQLite error that shouldn't crash the gateway.
 * These are typically temporary I/O or locking issues (e.g., running as a LaunchAgent on macOS).
 *
 * Every candidate yielded by `collectErrorGraphCandidates` (the error itself plus the
 * nested sources returned by `collectNestedErrorSources`) is classified as transient when:
 *  1. its code is listed in `TRANSIENT_SQLITE_CODES`, or
 *  2. its code is the generic `ERR_SQLITE_ERROR` AND its message contains a known
 *     transient phrase ("database is locked", "database is busy",
 *     "unable to open database").
 *
 * The message fallback is only consulted for `ERR_SQLITE_ERROR`; an arbitrary error
 * whose text merely mentions one of these phrases is not treated as transient.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` if a candidate in the error graph is a transient SQLite failure.
 */
export function isTransientSqliteError(err: unknown): boolean {
  if (!err) {
    return false;
  }
  for (const candidate of collectErrorGraphCandidates(err, collectNestedErrorSources)) {
    const code = extractErrorCodeOrErrno(candidate);
    if (!code) {
      continue;
    }
    if (TRANSIENT_SQLITE_CODES.has(code)) {
      return true;
    }
    // node:sqlite surfaces errors as code: ERR_SQLITE_ERROR with transient
    // details only in the message text. Require that code first, then inspect
    // the message for known transient patterns. (Using `||` here would classify
    // any error with a matching message as transient, regardless of its code.)
    if (
      code === "ERR_SQLITE_ERROR" &&
      candidate &&
      typeof candidate === "object" &&
      "message" in candidate
    ) {
      const rawMessage = (candidate as { message: unknown }).message;
      if (typeof rawMessage !== "string") {
        // A non-string message cannot carry one of the known phrases.
        continue;
      }
      const msg = rawMessage.toLowerCase();
      if (
        msg.includes("database is locked") ||
        msg.includes("database is busy") ||
        msg.includes("unable to open database")
      ) {
        return true;
      }
    }
  }
  return false;
}
/** /**
* Checks if an error is a transient network error that shouldn't crash the gateway. * Checks if an error is a transient network error that shouldn't crash the gateway.
* These are typically temporary connectivity issues that will resolve on their own. * These are typically temporary connectivity issues that will resolve on their own.
@ -150,19 +211,7 @@ export function isTransientNetworkError(err: unknown): boolean {
if (!err) { if (!err) {
return false; return false;
} }
for (const candidate of collectErrorGraphCandidates(err, (current) => { for (const candidate of collectErrorGraphCandidates(err, collectNestedErrorSources)) {
const nested: Array<unknown> = [
current.cause,
current.reason,
current.original,
current.error,
current.data,
];
if (Array.isArray(current.errors)) {
nested.push(...current.errors);
}
return nested;
})) {
const code = extractErrorCodeOrErrno(candidate); const code = extractErrorCodeOrErrno(candidate);
if (code && TRANSIENT_NETWORK_CODES.has(code)) { if (code && TRANSIENT_NETWORK_CODES.has(code)) {
return true; return true;
@ -251,6 +300,11 @@ export function installUnhandledRejectionHandler(): void {
return; return;
} }
if (isTransientSqliteError(reason)) {
console.warn("[openclaw] Non-fatal SQLite error (continuing):", formatUncaughtError(reason));
return;
}
console.error("[openclaw] Unhandled promise rejection:", formatUncaughtError(reason)); console.error("[openclaw] Unhandled promise rejection:", formatUncaughtError(reason));
process.exit(1); process.exit(1);
}); });