fix(voice-call): reject oversized pre-start media frames (#58241)

* fix(voice-call): reject oversized pre-start frames

* fix(voice-call): avoid normalizing oversized frames

* chore(changelog): remove stray spacing

* fix(voice-call): remove dead inbound size guard
This commit is contained in:
Vincent Koc 2026-03-31 19:47:10 +09:00 committed by GitHub
parent 9bc1f896c8
commit 9abcfdadf5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 56 additions and 4 deletions

View File

@ -147,6 +147,7 @@ Docs: https://docs.openclaw.ai
- Config/SecretRef + Control UI: harden SecretRef redaction round-trip restore, block unsafe raw fallback (force Form mode when raw is unavailable), and preflight submitted-config SecretRefs before config write RPC persistence. (#58044) Thanks @joshavant.
- Config/Telegram: migrate removed `channels.telegram.groupMentionsOnly` into `channels.telegram.groups["*"].requireMention` on load so legacy configs no longer crash at startup. (#55336) Thanks @jameslcowan.
- Gateway/SecretRef: resolve restart token drift checks with merged service/runtime env sources and hard-fail unsupported mutable SecretRef plus OAuth-profile combinations so restart warnings and policy enforcement match runtime behavior. (#58141) Thanks @joshavant.
- Voice Call/media stream: cap inbound WebSocket frame size before `start` validation so oversized pre-start frames are dropped before JSON parsing. Thanks @Kazamayc and @vincentkoc.
- Pairing: enforce pending request limits per account instead of per shared channel queue, so one account's outstanding pairing challenges no longer block new pairing on other accounts. Thanks @smaeljaish771 and @vincentkoc.
- Exec approvals: unwrap `caffeinate` and `sandbox-exec` before persisting allow-always trust so later shell payload changes still require a fresh approval. Thanks @tdjackey and @vincentkoc.

View File

@ -344,4 +344,39 @@ describe("MediaStreamHandler security hardening", () => {
await server.close();
}
});
it("rejects oversized pre-start frames at the websocket maxPayload guard before validation runs", async () => {
  // Records every invocation of the accept hook. It must stay empty: the
  // oversized frame is expected to be rejected before start validation runs.
  const acceptHookCalls: Array<{ callId: string; streamSid: string; token?: string }> = [];
  const handler = new MediaStreamHandler({
    sttProvider: createStubSttProvider(),
    preStartTimeoutMs: 1_000,
    shouldAcceptStream: (params) => {
      acceptHookCalls.push(params);
      return true;
    },
  });
  const server = await startWsServer(handler);
  try {
    const ws = await connectWs(server.url);
    // A syntactically valid `start` event carrying 256 KiB of padding, so the
    // only thing wrong with the frame is its size.
    const oversizedStart = JSON.stringify({
      event: "start",
      streamSid: "MZ-oversized",
      start: {
        callSid: "CA-oversized",
        customParameters: { token: "token-oversized", padding: "A".repeat(256 * 1024) },
      },
    });
    ws.send(oversizedStart);
    const { code } = await waitForClose(ws);
    // 1009 is the WebSocket "message too big" close code.
    expect(code).toBe(1009);
    expect(acceptHookCalls).toEqual([]);
  } finally {
    await server.close();
  }
});
});

View File

@ -9,7 +9,7 @@
import type { IncomingMessage } from "node:http";
import type { Duplex } from "node:stream";
import { WebSocket, WebSocketServer } from "ws";
import { type RawData, WebSocket, WebSocketServer } from "ws";
import type {
OpenAIRealtimeSTTProvider,
RealtimeSTTSession,
@ -76,6 +76,7 @@ const DEFAULT_PRE_START_TIMEOUT_MS = 5000;
const DEFAULT_MAX_PENDING_CONNECTIONS = 32;
const DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP = 4;
const DEFAULT_MAX_CONNECTIONS = 128;
// Upper bound (64 KiB) on a single inbound WebSocket message. Passed to the
// WebSocketServer as `maxPayload`, so larger frames are rejected by `ws`
// itself before any application-level parsing happens.
const MAX_INBOUND_MESSAGE_BYTES = 64 * 1024;
const MAX_WS_BUFFERED_BYTES = 1024 * 1024;
const CLOSE_REASON_LOG_MAX_CHARS = 120;
@ -90,6 +91,16 @@ export function sanitizeLogText(value: string, maxChars: number): string {
return `${sanitized.slice(0, maxChars)}...`;
}
/**
 * Collapse a `ws` message payload into a single Buffer.
 *
 * @param data - Payload handed to the "message" listener: a Buffer, an array
 *   of Buffer fragments, or an ArrayBuffer.
 * @returns The same Buffer when one was given, the concatenation of the
 *   fragments for an array, or a Buffer created from the ArrayBuffer.
 */
function normalizeWsMessageData(data: RawData): Buffer {
  // Fragmented payloads arrive as an array of Buffers; join them in order.
  if (Array.isArray(data)) {
    return Buffer.concat(data);
  }
  // A Buffer is passed through untouched; anything left is an ArrayBuffer.
  return Buffer.isBuffer(data) ? data : Buffer.from(data);
}
/**
* Manages WebSocket connections for Twilio media streams.
*/
@ -126,7 +137,11 @@ export class MediaStreamHandler {
*/
handleUpgrade(request: IncomingMessage, socket: Duplex, head: Buffer): void {
if (!this.wss) {
this.wss = new WebSocketServer({ noServer: true });
this.wss = new WebSocketServer({
noServer: true,
// Reject oversized frames before app-level parsing runs on unauthenticated sockets.
maxPayload: MAX_INBOUND_MESSAGE_BYTES,
});
this.wss.on("connection", (ws, req) => this.handleConnection(ws, req));
}
@ -154,9 +169,10 @@ export class MediaStreamHandler {
return;
}
ws.on("message", async (data: Buffer) => {
ws.on("message", async (data: RawData) => {
try {
const message = JSON.parse(data.toString()) as TwilioMediaMessage;
const raw = normalizeWsMessageData(data);
const message = JSON.parse(raw.toString("utf8")) as TwilioMediaMessage;
switch (message.event) {
case "connected":