Fix: Correctly estimate CJK character token count in context pruner (openclaw#39985)

Verified:
- pnpm install --frozen-lockfile
- pnpm build
- pnpm check
- pnpm test -- src/agents/pi-extensions/context-pruning.test.ts src/utils/cjk-chars.test.ts

Co-authored-by: Edward-Qiang-2024 <176464463+Edward-Qiang-2024@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
Edward-Qiang-2024 2026-03-29 10:16:52 +08:00 committed by GitHub
parent 7cf87c4e53
commit 1c8758fbd5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 38 additions and 5 deletions

View File

@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
- LINE/ACP: add current-conversation binding and inbound binding-routing parity so `/acp spawn ... --thread here`, configured ACP bindings, and active conversation-bound ACP sessions work on LINE like the other conversation channels.
- TTS/Microsoft: auto-switch the default Edge voice to Chinese for CJK-dominant text without overriding explicitly selected Microsoft voices. (#52355) Thanks @extrasmall0.
- Agents/context pruning: count supplementary-plane CJK characters with the shared code-point-aware estimator so context pruning stops underestimating Japanese and Chinese text that uses Extension B ideographs. (#39985) Thanks @Edward-Qiang-2024.
- macOS/local gateway: stop OpenClaw.app from killing healthy local gateway listeners after startup by recognizing the current `openclaw-gateway` process title and using the current `openclaw gateway` launch shape.
- Memory/QMD: resolve slugified `memory_search` file hints back to the indexed filesystem path before returning search hits, so `memory_get` works again for mixed-case and spaced paths. (#50313) Thanks @erra9x.
- Memory/QMD: weight CJK-heavy text correctly when estimating chunk sizes, preserve surrogate-pair characters during fine splits, and keep long Latin lines on the old chunk boundaries so memory indexing produces better-sized chunks for CJK notes. (#40271) Thanks @AaronLuo00.

View File

@@ -269,6 +269,34 @@ describe("context-pruning", () => {
expect(toolText(findToolResult(next, "t3"))).toContain("z".repeat(20_000));
});
// Regression test for #39985: supplementary-plane CJK ideographs (CJK
// Extension B, U+20000 and above) must go through the shared
// code-point-aware estimator so the pruner does not undercount them.
it("accounts for CJK Extension B text when deciding whether to prune", () => {
  const ideographRun = "𠀀".repeat(50);

  const messages: AgentMessage[] = [
    makeUser(ideographRun),
    makeToolResult({ toolCallId: "t1", toolName: "exec", text: "keep me" }),
  ];

  const settings = makeAggressiveSettings({
    keepLastAssistants: 0,
    softTrimRatio: 1,
    hardClearRatio: 1,
    minPrunableToolChars: 0,
    hardClear: { enabled: true, placeholder: "[cleared]" },
  });

  const pruned = pruneContextMessages({
    messages,
    settings,
    ctx: CONTEXT_WINDOW_1000,
    contextWindowTokensOverride: 40,
    isToolPrunable: () => true,
  });

  // With CJK-aware weighting the estimate overflows the 40-token override,
  // so the prunable tool result must be hard-cleared to the placeholder.
  expect(toolText(findToolResult(pruned, "t1"))).toBe("[cleared]");
});
it("uses contextWindow override when ctx.model is missing", () => {
const messages = makeSimpleToolPruningMessages(true);

View File

@@ -1,10 +1,10 @@
import type { AgentMessage } from "@mariozechner/pi-agent-core";
import type { ImageContent, TextContent, ToolResultMessage } from "@mariozechner/pi-ai";
import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
import { CHARS_PER_TOKEN_ESTIMATE, estimateStringChars } from "../../../utils/cjk-chars.js";
import type { EffectiveContextPruningSettings } from "./settings.js";
import { makeToolPrunablePredicate } from "./tools.js";
const CHARS_PER_TOKEN_ESTIMATE = 4;
const IMAGE_CHAR_ESTIMATE = 8_000;
const PRUNED_CONTEXT_IMAGE_MARKER = "[image removed during context pruning]";
@@ -111,11 +111,15 @@ function hasImageBlocks(content: ReadonlyArray<TextContent | ImageContent>): boo
return false;
}
/**
 * Returns the weighted character count for a piece of text, delegating to the
 * shared code-point-aware estimator so CJK text — including supplementary-plane
 * Extension B ideographs encoded as surrogate pairs — is not undercounted.
 *
 * NOTE(review): this looks like a deliberate local seam over
 * `estimateStringChars` for pruning-specific weighting tweaks — confirm
 * before inlining it at call sites.
 */
function estimateWeightedTextChars(text: string): number {
  const weightedChars = estimateStringChars(text);
  return weightedChars;
}
function estimateTextAndImageChars(content: ReadonlyArray<TextContent | ImageContent>): number {
let chars = 0;
for (const block of content) {
if (block.type === "text") {
chars += block.text.length;
chars += estimateWeightedTextChars(block.text);
}
if (block.type === "image") {
chars += IMAGE_CHAR_ESTIMATE;
@@ -128,7 +132,7 @@ function estimateMessageChars(message: AgentMessage): number {
if (message.role === "user") {
const content = message.content;
if (typeof content === "string") {
return content.length;
return estimateWeightedTextChars(content);
}
return estimateTextAndImageChars(content);
}
@ -140,10 +144,10 @@ function estimateMessageChars(message: AgentMessage): number {
continue;
}
if (b.type === "text" && typeof b.text === "string") {
chars += b.text.length;
chars += estimateWeightedTextChars(b.text);
}
if (b.type === "thinking" && typeof b.thinking === "string") {
chars += b.thinking.length;
chars += estimateWeightedTextChars(b.thinking);
}
if (b.type === "toolCall") {
try {