From 1c8758fbd51d2af197834dc2afdef2bac88f2b93 Mon Sep 17 00:00:00 2001 From: Edward-Qiang-2024 Date: Sun, 29 Mar 2026 10:16:52 +0800 Subject: [PATCH] Fix: Correctly estimate CJK character token count in context pruner (openclaw#39985) Verified: - pnpm install --frozen-lockfile - pnpm build - pnpm check - pnpm test -- src/agents/pi-extensions/context-pruning.test.ts src/utils/cjk-chars.test.ts Co-authored-by: Edward-Qiang-2024 <176464463+Edward-Qiang-2024@users.noreply.github.com> Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com> --- CHANGELOG.md | 1 + .../pi-extensions/context-pruning.test.ts | 28 +++++++++++++++++++ .../pi-extensions/context-pruning/pruner.ts | 14 ++++++---- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8efc07164cc..dffc3f39c6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai - LINE/ACP: add current-conversation binding and inbound binding-routing parity so `/acp spawn ... --thread here`, configured ACP bindings, and active conversation-bound ACP sessions work on LINE like the other conversation channels. - TTS/Microsoft: auto-switch the default Edge voice to Chinese for CJK-dominant text without overriding explicitly selected Microsoft voices. (#52355) Thanks @extrasmall0. +- Agents/context pruning: count supplementary-plane CJK characters with the shared code-point-aware estimator so context pruning stops underestimating Japanese and Chinese text that uses Extension B ideographs. (#39985) Thanks @Edward-Qiang-2024. - macOS/local gateway: stop OpenClaw.app from killing healthy local gateway listeners after startup by recognizing the current `openclaw-gateway` process title and using the current `openclaw gateway` launch shape. - Memory/QMD: resolve slugified `memory_search` file hints back to the indexed filesystem path before returning search hits, so `memory_get` works again for mixed-case and spaced paths. (#50313) Thanks @erra9x. - Memory/QMD: weight CJK-heavy text correctly when estimating chunk sizes, preserve surrogate-pair characters during fine splits, and keep long Latin lines on the old chunk boundaries so memory indexing produces better-sized chunks for CJK notes. (#40271) Thanks @AaronLuo00. diff --git a/src/agents/pi-extensions/context-pruning.test.ts b/src/agents/pi-extensions/context-pruning.test.ts index 9dedff97def..76f39a60f08 100644 --- a/src/agents/pi-extensions/context-pruning.test.ts +++ b/src/agents/pi-extensions/context-pruning.test.ts @@ -269,6 +269,34 @@ describe("context-pruning", () => { expect(toolText(findToolResult(next, "t3"))).toContain("z".repeat(20_000)); }); + it("accounts for CJK Extension B text when deciding whether to prune", () => { + const extensionBText = "𠀀".repeat(50); + const messages: AgentMessage[] = [ + makeUser(extensionBText), + makeToolResult({ + toolCallId: "t1", + toolName: "exec", + text: "keep me", + }), + ]; + + const next = pruneContextMessages({ + messages, + settings: makeAggressiveSettings({ + keepLastAssistants: 0, + softTrimRatio: 1, + hardClearRatio: 1, + minPrunableToolChars: 0, + hardClear: { enabled: true, placeholder: "[cleared]" }, + }), + ctx: CONTEXT_WINDOW_1000, + contextWindowTokensOverride: 40, + isToolPrunable: () => true, + }); + + expect(toolText(findToolResult(next, "t1"))).toBe("[cleared]"); + }); + it("uses contextWindow override when ctx.model is missing", () => { const messages = makeSimpleToolPruningMessages(true); diff --git a/src/agents/pi-extensions/context-pruning/pruner.ts b/src/agents/pi-extensions/context-pruning/pruner.ts index a0f4458f6d4..55a9da89b21 100644 --- a/src/agents/pi-extensions/context-pruning/pruner.ts +++ b/src/agents/pi-extensions/context-pruning/pruner.ts @@ -1,10 +1,10 @@ import type { AgentMessage } from "@mariozechner/pi-agent-core"; import type { ImageContent, TextContent, ToolResultMessage } from "@mariozechner/pi-ai"; import type { ExtensionContext } from "@mariozechner/pi-coding-agent"; +import { CHARS_PER_TOKEN_ESTIMATE, estimateStringChars } from "../../../utils/cjk-chars.js"; import type { EffectiveContextPruningSettings } from "./settings.js"; import { makeToolPrunablePredicate } from "./tools.js"; -const CHARS_PER_TOKEN_ESTIMATE = 4; const IMAGE_CHAR_ESTIMATE = 8_000; const PRUNED_CONTEXT_IMAGE_MARKER = "[image removed during context pruning]"; @@ -111,11 +111,15 @@ function hasImageBlocks(content: ReadonlyArray): boo return false; } +function estimateWeightedTextChars(text: string): number { + return estimateStringChars(text); +} + function estimateTextAndImageChars(content: ReadonlyArray): number { let chars = 0; for (const block of content) { if (block.type === "text") { - chars += block.text.length; + chars += estimateWeightedTextChars(block.text); } if (block.type === "image") { chars += IMAGE_CHAR_ESTIMATE; @@ -128,7 +132,7 @@ function estimateMessageChars(message: AgentMessage): number { if (message.role === "user") { const content = message.content; if (typeof content === "string") { - return content.length; + return estimateWeightedTextChars(content); } return estimateTextAndImageChars(content); } @@ -140,10 +144,10 @@ function estimateMessageChars(message: AgentMessage): number { continue; } if (b.type === "text" && typeof b.text === "string") { - chars += b.text.length; + chars += estimateWeightedTextChars(b.text); } if (b.type === "thinking" && typeof b.thinking === "string") { - chars += b.thinking.length; + chars += estimateWeightedTextChars(b.thinking); } if (b.type === "toolCall") { try {