diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ab99495642..d1284d9c538 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -147,6 +147,7 @@ Docs: https://docs.openclaw.ai - Config/SecretRef + Control UI: harden SecretRef redaction round-trip restore, block unsafe raw fallback (force Form mode when raw is unavailable), and preflight submitted-config SecretRefs before config write RPC persistence. (#58044) Thanks @joshavant. - Config/Telegram: migrate removed `channels.telegram.groupMentionsOnly` into `channels.telegram.groups["*"].requireMention` on load so legacy configs no longer crash at startup. (#55336) thanks @jameslcowan. - Gateway/SecretRef: resolve restart token drift checks with merged service/runtime env sources and hard-fail unsupported mutable SecretRef plus OAuth-profile combinations so restart warnings and policy enforcement match runtime behavior. (#58141) Thanks @joshavant. +- Voice Call/media stream: cap inbound WebSocket frames at 64 KiB via `maxPayload` so oversized frames, including unauthenticated pre-`start` ones, close the socket with code 1009 before JSON parsing or `start` validation runs. Thanks @Kazamayc and @vincentkoc. - Pairing: enforce pending request limits per account instead of per shared channel queue, so one account's outstanding pairing challenges no longer block new pairing on other accounts. Thanks @smaeljaish771 and @vincentkoc. - Exec approvals: unwrap `caffeinate` and `sandbox-exec` before persisting allow-always trust so later shell payload changes still require a fresh approval. Thanks @tdjackey and @vincentkoc. 
diff --git a/extensions/voice-call/src/media-stream.test.ts b/extensions/voice-call/src/media-stream.test.ts index f1e564c0134..8f6e16bc5e8 100644 --- a/extensions/voice-call/src/media-stream.test.ts +++ b/extensions/voice-call/src/media-stream.test.ts @@ -344,4 +344,39 @@ describe("MediaStreamHandler security hardening", () => { await server.close(); } }); + + it("rejects oversized pre-start frames at the websocket maxPayload guard before validation runs", async () => { + const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> = + []; + const handler = new MediaStreamHandler({ + sttProvider: createStubSttProvider(), + preStartTimeoutMs: 1_000, + shouldAcceptStream: (params) => { + shouldAcceptStreamCalls.push(params); + return true; + }, + }); + const server = await startWsServer(handler); + + try { + const ws = await connectWs(server.url); + ws.send( + JSON.stringify({ + event: "start", + streamSid: "MZ-oversized", + start: { + callSid: "CA-oversized", + customParameters: { token: "token-oversized", padding: "A".repeat(256 * 1024) }, + }, + }), + ); + + const closed = await waitForClose(ws); + + expect(closed.code).toBe(1009); + expect(shouldAcceptStreamCalls).toEqual([]); + } finally { + await server.close(); + } + }); }); diff --git a/extensions/voice-call/src/media-stream.ts b/extensions/voice-call/src/media-stream.ts index 31f0e9c46c9..fb259d723b8 100644 --- a/extensions/voice-call/src/media-stream.ts +++ b/extensions/voice-call/src/media-stream.ts @@ -9,7 +9,7 @@ import type { IncomingMessage } from "node:http"; import type { Duplex } from "node:stream"; -import { WebSocket, WebSocketServer } from "ws"; +import { type RawData, WebSocket, WebSocketServer } from "ws"; import type { OpenAIRealtimeSTTProvider, RealtimeSTTSession, @@ -76,6 +76,7 @@ const DEFAULT_PRE_START_TIMEOUT_MS = 5000; const DEFAULT_MAX_PENDING_CONNECTIONS = 32; const DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP = 4; const DEFAULT_MAX_CONNECTIONS = 128; +const 
MAX_INBOUND_MESSAGE_BYTES = 64 * 1024; const MAX_WS_BUFFERED_BYTES = 1024 * 1024; const CLOSE_REASON_LOG_MAX_CHARS = 120; @@ -90,6 +91,16 @@ export function sanitizeLogText(value: string, maxChars: number): string { return `${sanitized.slice(0, maxChars)}...`; } +function normalizeWsMessageData(data: RawData): Buffer { + if (Buffer.isBuffer(data)) { + return data; + } + if (Array.isArray(data)) { + return Buffer.concat(data); + } + return Buffer.from(data); +} + /** * Manages WebSocket connections for Twilio media streams. */ @@ -126,7 +137,11 @@ export class MediaStreamHandler { */ handleUpgrade(request: IncomingMessage, socket: Duplex, head: Buffer): void { if (!this.wss) { - this.wss = new WebSocketServer({ noServer: true }); + this.wss = new WebSocketServer({ + noServer: true, + // Reject oversized frames before app-level parsing runs on unauthenticated sockets. + maxPayload: MAX_INBOUND_MESSAGE_BYTES, + }); this.wss.on("connection", (ws, req) => this.handleConnection(ws, req)); } @@ -154,9 +169,10 @@ export class MediaStreamHandler { return; } - ws.on("message", async (data: Buffer) => { + ws.on("message", async (data: RawData) => { try { - const message = JSON.parse(data.toString()) as TwilioMediaMessage; + const raw = normalizeWsMessageData(data); + const message = JSON.parse(raw.toString("utf8")) as TwilioMediaMessage; switch (message.event) { case "connected":