fix(voice-call): reject oversized pre-start media frames (#58241)

* fix(voice-call): reject oversized pre-start frames
* fix(voice-call): avoid normalizing oversized frames
* chore(changelog): remove stray spacing
* fix(voice-call): remove dead inbound size guard
2026-03-31 19:47:10 +09:00 · 2026-03-31 19:47:10 +09:00 · 9abcfdadf5
parent 9bc1f896c8
commit 9abcfdadf5
3 changed files with 56 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -147,6 +147,7 @@ Docs: https://docs.openclaw.ai
 - Config/SecretRef + Control UI: harden SecretRef redaction round-trip restore, block unsafe raw fallback (force Form mode when raw is unavailable), and preflight submitted-config SecretRefs before config write RPC persistence. (#58044) Thanks @joshavant.
 - Config/Telegram: migrate removed `channels.telegram.groupMentionsOnly` into `channels.telegram.groups["*"].requireMention` on load so legacy configs no longer crash at startup. (#55336) thanks @jameslcowan.
 - Gateway/SecretRef: resolve restart token drift checks with merged service/runtime env sources and hard-fail unsupported mutable SecretRef plus OAuth-profile combinations so restart warnings and policy enforcement match runtime behavior. (#58141) Thanks @joshavant.
+- Voice Call/media stream: cap inbound WebSocket frame size before `start` validation so oversized pre-start frames are dropped before JSON parsing. Thanks @Kazamayc and @vincentkoc.
 - Pairing: enforce pending request limits per account instead of per shared channel queue, so one account's outstanding pairing challenges no longer block new pairing on other accounts. Thanks @smaeljaish771 and @vincentkoc.
 - Exec approvals: unwrap `caffeinate` and `sandbox-exec` before persisting allow-always trust so later shell payload changes still require a fresh approval. Thanks @tdjackey and @vincentkoc.

--- a/extensions/voice-call/src/media-stream.test.ts
+++ b/extensions/voice-call/src/media-stream.test.ts
@@ -344,4 +344,39 @@ describe("MediaStreamHandler security hardening", () => {
      await server.close();
    }
  });
+
+  it("rejects oversized pre-start frames at the websocket maxPayload guard before validation runs", async () => {
+    const shouldAcceptStreamCalls: Array<{ callId: string; streamSid: string; token?: string }> =
+      [];
+    const handler = new MediaStreamHandler({
+      sttProvider: createStubSttProvider(),
+      preStartTimeoutMs: 1_000,
+      shouldAcceptStream: (params) => {
+        shouldAcceptStreamCalls.push(params);
+        return true;
+      },
+    });
+    const server = await startWsServer(handler);
+
+    try {
+      const ws = await connectWs(server.url);
+      ws.send(
+        JSON.stringify({
+          event: "start",
+          streamSid: "MZ-oversized",
+          start: {
+            callSid: "CA-oversized",
+            customParameters: { token: "token-oversized", padding: "A".repeat(256 * 1024) },
+          },
+        }),
+      );
+
+      const closed = await waitForClose(ws);
+
+      expect(closed.code).toBe(1009);
+      expect(shouldAcceptStreamCalls).toEqual([]);
+    } finally {
+      await server.close();
+    }
+  });
 });
--- a/extensions/voice-call/src/media-stream.ts
+++ b/extensions/voice-call/src/media-stream.ts
@@ -9,7 +9,7 @@

 import type { IncomingMessage } from "node:http";
 import type { Duplex } from "node:stream";
-import { WebSocket, WebSocketServer } from "ws";
+import { type RawData, WebSocket, WebSocketServer } from "ws";
 import type {
  OpenAIRealtimeSTTProvider,
  RealtimeSTTSession,
@@ -76,6 +76,7 @@ const DEFAULT_PRE_START_TIMEOUT_MS = 5000;
 const DEFAULT_MAX_PENDING_CONNECTIONS = 32;
 const DEFAULT_MAX_PENDING_CONNECTIONS_PER_IP = 4;
 const DEFAULT_MAX_CONNECTIONS = 128;
+const MAX_INBOUND_MESSAGE_BYTES = 64 * 1024;
 const MAX_WS_BUFFERED_BYTES = 1024 * 1024;
 const CLOSE_REASON_LOG_MAX_CHARS = 120;

@@ -90,6 +91,16 @@ export function sanitizeLogText(value: string, maxChars: number): string {
  return `${sanitized.slice(0, maxChars)}...`;
 }

+function normalizeWsMessageData(data: RawData): Buffer {
+  if (Buffer.isBuffer(data)) {
+    return data;
+  }
+  if (Array.isArray(data)) {
+    return Buffer.concat(data);
+  }
+  return Buffer.from(data);
+}
+
 /**
 * Manages WebSocket connections for Twilio media streams.
 */
@@ -126,7 +137,11 @@ export class MediaStreamHandler {
   */
  handleUpgrade(request: IncomingMessage, socket: Duplex, head: Buffer): void {
    if (!this.wss) {
-      this.wss = new WebSocketServer({ noServer: true });
+      this.wss = new WebSocketServer({
+        noServer: true,
+        // Reject oversized frames before app-level parsing runs on unauthenticated sockets.
+        maxPayload: MAX_INBOUND_MESSAGE_BYTES,
+      });
      this.wss.on("connection", (ws, req) => this.handleConnection(ws, req));
    }

@@ -154,9 +169,10 @@ export class MediaStreamHandler {
      return;
    }

-    ws.on("message", async (data: Buffer) => {
+    ws.on("message", async (data: RawData) => {
      try {
-        const message = JSON.parse(data.toString()) as TwilioMediaMessage;
+        const raw = normalizeWsMessageData(data);
+        const message = JSON.parse(raw.toString("utf8")) as TwilioMediaMessage;

        switch (message.event) {
          case "connected":