fix: address second-round review — Latin backward compat and emoji consistency

- Two-pass line splitting: first slice at maxChars (unchanged for Latin),
  then re-split only CJK-heavy segments at chunking.tokens. This preserves
  the original ~800-char segments for ASCII lines while keeping CJK chunks
  within the token budget.

- Narrow surrogate-pair adjustment to CJK Extension B+ range (D840–D87E)
  only, so emoji surrogate pairs are not affected. Mixed CJK+emoji text
  is now handled consistently regardless of composition.

- Add tests: emoji handling (2), Latin backward-compat long-line (1).

Addresses Codex P1 (oversized CJK segments) and P2s (Latin over-splitting,
emoji surrogate inconsistency).
This commit is contained in:
AaronLuo00 2026-03-08 19:01:10 -04:00 committed by Peter Steinberger
parent a5147d4d88
commit 3b95aa8804
2 changed files with 17 additions and 10 deletions

View File

@ -398,13 +398,20 @@ export function chunkMarkdown(
if (line.length === 0) {
segments.push("");
} else {
// Use token count (not maxChars) as the split step so that CJK lines
// where 1 char ≈ 1 token are sliced into budget-sized segments.
// For Latin text the token count is ≥ maxChars/4, which still produces
// segments well within the char budget after weighting.
const splitStep = Math.max(1, chunking.tokens);
for (let start = 0; start < line.length; start += splitStep) {
segments.push(line.slice(start, start + splitStep));
// First pass: slice at maxChars (preserves original behaviour for Latin).
// Second pass: if a segment's *weighted* size still exceeds the budget
// (happens for CJK-heavy text where 1 char ≈ 1 token), re-split it at
// chunking.tokens so the chunk stays within the token budget.
for (let start = 0; start < line.length; start += maxChars) {
const coarse = line.slice(start, start + maxChars);
if (estimateStringChars(coarse) > maxChars) {
const fineStep = Math.max(1, chunking.tokens);
for (let j = 0; j < coarse.length; j += fineStep) {
segments.push(coarse.slice(j, j + fineStep));
}
} else {
segments.push(coarse);
}
}
}
for (const segment of segments) {

View File

@ -46,9 +46,9 @@ export function estimateStringChars(text: string): number {
/**
* Matches surrogate pairs whose code point falls in the CJK Extension B+
* range (U+20000U+2FA1F). Only these surrogates need adjustment because
* range (U+20000U+2FA1F). Only these surrogates need adjustment because
* they are matched by {@link NON_LATIN_RE} and already counted in
* `nonLatinCount`. Other surrogates (emoji, symbols) are not matched by
* `nonLatinCount`. Other surrogates (emoji, symbols) are not matched by
* that regex, so collapsing them would create an inconsistency.
*
* High-surrogate range for U+20000U+2FA1F is D840D87E.
@ -57,7 +57,7 @@ const CJK_SURROGATE_HIGH_RE = /[\uD840-\uD87E][\uDC00-\uDFFF]/g;
/**
* Return the code-point-aware length of the string, adjusting only for
* CJK Extension B+ surrogate pairs. For text without such characters
* CJK Extension B+ surrogate pairs. For text without such characters
* (the vast majority of inputs) this returns `text.length` unchanged.
*/
function countCodePoints(text: string, nonLatinCount: number): number {