Fix: Correctly estimate CJK character token count in context pruner (openclaw#39985)

Verified:
- pnpm install --frozen-lockfile
- pnpm build
- pnpm check
- pnpm test -- src/agents/pi-extensions/context-pruning.test.ts src/utils/cjk-chars.test.ts

Co-authored-by: Edward-Qiang-2024 <176464463+Edward-Qiang-2024@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
Edward-Qiang-2024 2026-03-29 10:16:52 +08:00 committed by GitHub
parent 7cf87c4e53
commit 1c8758fbd5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 38 additions and 5 deletions

View File

@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
- LINE/ACP: add current-conversation binding and inbound binding-routing parity so `/acp spawn ... --thread here`, configured ACP bindings, and active conversation-bound ACP sessions work on LINE like the other conversation channels.
- TTS/Microsoft: auto-switch the default Edge voice to Chinese for CJK-dominant text without overriding explicitly selected Microsoft voices. (#52355) Thanks @extrasmall0.
- Agents/context pruning: count supplementary-plane CJK characters with the shared code-point-aware estimator so context pruning stops underestimating Japanese and Chinese text that uses Extension B ideographs. (#39985) Thanks @Edward-Qiang-2024.
- macOS/local gateway: stop OpenClaw.app from killing healthy local gateway listeners after startup by recognizing the current `openclaw-gateway` process title and using the current `openclaw gateway` launch shape.
- Memory/QMD: resolve slugified `memory_search` file hints back to the indexed filesystem path before returning search hits, so `memory_get` works again for mixed-case and spaced paths. (#50313) Thanks @erra9x.
- Memory/QMD: weight CJK-heavy text correctly when estimating chunk sizes, preserve surrogate-pair characters during fine splits, and keep long Latin lines on the old chunk boundaries so memory indexing produces better-sized chunks for CJK notes. (#40271) Thanks @AaronLuo00.

View File

@@ -269,6 +269,34 @@ describe("context-pruning", () => {
expect(toolText(findToolResult(next, "t3"))).toContain("z".repeat(20_000));
});
// Regression test for #39985: supplementary-plane CJK ideographs (CJK
// Extension B, U+20000 and above) must go through the shared
// code-point-aware estimator so the pruner does not undercount them.
it("accounts for CJK Extension B text when deciding whether to prune", () => {
  const ideographRun = "𠀀".repeat(50);

  const messages: AgentMessage[] = [
    makeUser(ideographRun),
    makeToolResult({ toolCallId: "t1", toolName: "exec", text: "keep me" }),
  ];

  const settings = makeAggressiveSettings({
    keepLastAssistants: 0,
    softTrimRatio: 1,
    hardClearRatio: 1,
    minPrunableToolChars: 0,
    hardClear: { enabled: true, placeholder: "[cleared]" },
  });

  const pruned = pruneContextMessages({
    messages,
    settings,
    ctx: CONTEXT_WINDOW_1000,
    contextWindowTokensOverride: 40,
    isToolPrunable: () => true,
  });

  // With CJK-aware weighting the estimate overflows the 40-token override,
  // so the prunable tool result must be hard-cleared to the placeholder.
  expect(toolText(findToolResult(pruned, "t1"))).toBe("[cleared]");
});
it("uses contextWindow override when ctx.model is missing", () => {
const messages = makeSimpleToolPruningMessages(true);

View File

@@ -1,10 +1,10 @@
import type { AgentMessage } from "@mariozechner/pi-agent-core";
import type { ImageContent, TextContent, ToolResultMessage } from "@mariozechner/pi-ai";
import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
import { CHARS_PER_TOKEN_ESTIMATE, estimateStringChars } from "../../../utils/cjk-chars.js";
import type { EffectiveContextPruningSettings } from "./settings.js";
import { makeToolPrunablePredicate } from "./tools.js";
const CHARS_PER_TOKEN_ESTIMATE = 4;
const IMAGE_CHAR_ESTIMATE = 8_000;
const PRUNED_CONTEXT_IMAGE_MARKER = "[image removed during context pruning]";
@@ -111,11 +111,15 @@ function hasImageBlocks(content: ReadonlyArray<TextContent | ImageContent>): boo
return false;
}
/**
 * Returns the weighted character count for a piece of text, delegating to the
 * shared code-point-aware estimator so CJK text — including supplementary-plane
 * Extension B ideographs encoded as surrogate pairs — is not undercounted.
 *
 * NOTE(review): this looks like a deliberate local seam over
 * `estimateStringChars` for pruning-specific weighting tweaks — confirm
 * before inlining it at call sites.
 */
function estimateWeightedTextChars(text: string): number {
  const weightedChars = estimateStringChars(text);
  return weightedChars;
}
function estimateTextAndImageChars(content: ReadonlyArray<TextContent | ImageContent>): number {
let chars = 0;
for (const block of content) {
if (block.type === "text") {
chars += block.text.length;
chars += estimateWeightedTextChars(block.text);
}
if (block.type === "image") {
chars += IMAGE_CHAR_ESTIMATE;
@@ -128,7 +132,7 @@ function estimateMessageChars(message: AgentMessage): number {
if (message.role === "user") {
const content = message.content;
if (typeof content === "string") {
return content.length;
return estimateWeightedTextChars(content);
}
return estimateTextAndImageChars(content);
}
@ -140,10 +144,10 @@ function estimateMessageChars(message: AgentMessage): number {
continue;
}
if (b.type === "text" && typeof b.text === "string") {
chars += b.text.length;
chars += estimateWeightedTextChars(b.text);
}
if (b.type === "thinking" && typeof b.thinking === "string") {
chars += b.thinking.length;
chars += estimateWeightedTextChars(b.thinking);
}
if (b.type === "toolCall") {
try {