import { describe, expect, it } from "vitest";
import { expandQueryForFts, extractKeywords } from "./query-expansion.js";

/**
 * Tests for `extractKeywords`.
 *
 * Each case passes a query string and checks which tokens are present in
 * (or absent from) the returned keyword list. Queries cover English,
 * Chinese, Korean, Japanese, Spanish, Portuguese and Arabic, plus
 * mixed-language input and edge cases (empty input, stop words only,
 * duplicates).
 *
 * Every statement sits on its own line so that the `//` comments below
 * cannot swallow the assertions that follow them.
 */
describe("extractKeywords", () => {
  // --- English / Chinese ---------------------------------------------------

  it("extracts keywords from English conversational query", () => {
    const keywords = extractKeywords("that thing we discussed about the API");
    expect(keywords).toContain("discussed");
    // Expected in lower case even though the query has "API".
    expect(keywords).toContain("api");
    // Should not include stop words
    expect(keywords).not.toContain("that");
    expect(keywords).not.toContain("thing");
    expect(keywords).not.toContain("we");
    expect(keywords).not.toContain("about");
    expect(keywords).not.toContain("the");
  });

  it("extracts keywords from Chinese conversational query", () => {
    const keywords = extractKeywords("之前讨论的那个方案");
    expect(keywords).toContain("讨论");
    expect(keywords).toContain("方案");
    // Should not include stop words
    expect(keywords).not.toContain("之前");
    expect(keywords).not.toContain("的");
    expect(keywords).not.toContain("那个");
  });

  it("extracts keywords from mixed language query", () => {
    const keywords = extractKeywords("昨天讨论的 API design");
    expect(keywords).toContain("讨论");
    expect(keywords).toContain("api");
    expect(keywords).toContain("design");
  });

  it("returns specific technical terms", () => {
    const keywords = extractKeywords("what was the solution for the CFR bug");
    expect(keywords).toContain("solution");
    expect(keywords).toContain("cfr");
    expect(keywords).toContain("bug");
  });

  // --- Korean --------------------------------------------------------------

  it("extracts keywords from Korean conversational query", () => {
    const keywords = extractKeywords("어제 논의한 배포 전략");
    expect(keywords).toContain("논의한");
    expect(keywords).toContain("배포");
    expect(keywords).toContain("전략");
    // Should not include stop words
    expect(keywords).not.toContain("어제");
  });

  it("strips Korean particles to extract stems", () => {
    const keywords = extractKeywords("서버에서 발생한 에러를 확인");
    expect(keywords).toContain("서버");
    expect(keywords).toContain("에러");
    expect(keywords).toContain("확인");
  });

  it("filters Korean stop words including inflected forms", () => {
    const keywords = extractKeywords("나는 그리고 그래서");
    expect(keywords).not.toContain("나");
    expect(keywords).not.toContain("나는");
    expect(keywords).not.toContain("그리고");
    expect(keywords).not.toContain("그래서");
  });

  it("filters inflected Korean stop words not explicitly listed", () => {
    const keywords = extractKeywords("그녀는 우리는");
    expect(keywords).not.toContain("그녀는");
    expect(keywords).not.toContain("우리는");
    expect(keywords).not.toContain("그녀");
    expect(keywords).not.toContain("우리");
  });

  it("does not produce bogus single-char stems from particle stripping", () => {
    const keywords = extractKeywords("논의");
    expect(keywords).toContain("논의");
    expect(keywords).not.toContain("논");
  });

  it("strips longest Korean trailing particles first", () => {
    const keywords = extractKeywords("기능으로 설명");
    expect(keywords).toContain("기능");
    expect(keywords).not.toContain("기능으");
  });

  it("keeps stripped ASCII stems for mixed Korean tokens", () => {
    const keywords = extractKeywords("API를 배포했다");
    expect(keywords).toContain("api");
    expect(keywords).toContain("배포했다");
  });

  it("handles mixed Korean and English query", () => {
    const keywords = extractKeywords("API 배포에 대한 논의");
    expect(keywords).toContain("api");
    expect(keywords).toContain("배포");
    expect(keywords).toContain("논의");
  });

  // --- Japanese ------------------------------------------------------------

  it("extracts keywords from Japanese conversational query", () => {
    const keywords = extractKeywords("昨日話したデプロイ戦略");
    expect(keywords).toContain("デプロイ");
    expect(keywords).toContain("戦略");
    expect(keywords).not.toContain("昨日");
  });

  it("handles mixed Japanese and English query", () => {
    const keywords = extractKeywords("昨日話したAPIのバグ");
    expect(keywords).toContain("api");
    expect(keywords).toContain("バグ");
    expect(keywords).not.toContain("した");
  });

  it("filters Japanese stop words", () => {
    const keywords = extractKeywords("これ それ そして どう");
    expect(keywords).not.toContain("これ");
    expect(keywords).not.toContain("それ");
    expect(keywords).not.toContain("そして");
    expect(keywords).not.toContain("どう");
  });

  // --- Spanish / Portuguese ------------------------------------------------

  it("extracts keywords from Spanish conversational query", () => {
    const keywords = extractKeywords("ayer hablamos sobre la estrategia de despliegue");
    expect(keywords).toContain("estrategia");
    expect(keywords).toContain("despliegue");
    expect(keywords).not.toContain("ayer");
    expect(keywords).not.toContain("sobre");
  });

  it("extracts keywords from Portuguese conversational query", () => {
    const keywords = extractKeywords("ontem falamos sobre a estratégia de implantação");
    expect(keywords).toContain("estratégia");
    expect(keywords).toContain("implantação");
    expect(keywords).not.toContain("ontem");
    expect(keywords).not.toContain("sobre");
  });

  it("filters Spanish and Portuguese question stop words", () => {
    const keywords = extractKeywords("cómo cuando donde porquê quando onde");
    expect(keywords).not.toContain("cómo");
    expect(keywords).not.toContain("cuando");
    expect(keywords).not.toContain("donde");
    expect(keywords).not.toContain("porquê");
    expect(keywords).not.toContain("quando");
    expect(keywords).not.toContain("onde");
  });

  // --- Arabic --------------------------------------------------------------

  it("extracts keywords from Arabic conversational query", () => {
    const keywords = extractKeywords("بالأمس ناقشنا استراتيجية النشر");
    expect(keywords).toContain("ناقشنا");
    expect(keywords).toContain("استراتيجية");
    expect(keywords).toContain("النشر");
    expect(keywords).not.toContain("بالأمس");
  });

  it("filters Arabic question stop words", () => {
    const keywords = extractKeywords("كيف متى أين ماذا");
    expect(keywords).not.toContain("كيف");
    expect(keywords).not.toContain("متى");
    expect(keywords).not.toContain("أين");
    expect(keywords).not.toContain("ماذا");
  });

  // --- Edge cases ----------------------------------------------------------

  it("handles empty query", () => {
    expect(extractKeywords("")).toEqual([]);
    // Whitespace-only input is expected to behave like the empty string.
    expect(extractKeywords(" ")).toEqual([]);
  });

  it("handles query with only stop words", () => {
    const keywords = extractKeywords("the a an is are");
    expect(keywords).toHaveLength(0);
  });

  it("removes duplicate keywords", () => {
    const keywords = extractKeywords("test test testing");
    // Only exact "test" entries are counted; "testing" is a different token.
    const testCount = keywords.filter((k) => k === "test").length;
    expect(testCount).toBe(1);
  });
});
/**
 * Tests for `expandQueryForFts`.
 *
 * The cases read three properties of the returned object: `original`,
 * `keywords` and `expanded`.
 */
describe("expandQueryForFts", () => {
  it("returns original query and extracted keywords", () => {
    const query = "that API we discussed";
    const { original, keywords } = expandQueryForFts(query);

    // The input string must come back unchanged.
    expect(original).toBe(query);
    expect(keywords).toContain("api");
    expect(keywords).toContain("discussed");
  });

  it("builds expanded OR query for FTS", () => {
    const { expanded } = expandQueryForFts("the solution for bugs");

    // Checked in order: the OR operator first, then each expected keyword.
    for (const fragment of ["OR", "solution", "bugs"]) {
      expect(expanded).toContain(fragment);
    }
  });

  it("returns original query when no keywords extracted", () => {
    const { keywords, expanded } = expandQueryForFts("the");

    expect(keywords).toHaveLength(0);
    // With no keywords, the expanded query is expected to equal the input.
    expect(expanded).toBe("the");
  });
});