From 711eb69e5b4ee226842709bd4e21e254048ea80b Mon Sep 17 00:00:00 2001
From: watsonk1998 <1515673657@qq.com>
Date: Mon, 13 Apr 2026 22:03:17 +0800
Subject: [PATCH] fix(query): improve keyword matching for CJK languages

---
 tools/query.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/tools/query.py b/tools/query.py
index bc44419..d18e3b7 100644
--- a/tools/query.py
+++ b/tools/query.py
@@ -42,14 +42,28 @@ def find_relevant_pages(question: str, index_content: str) -> list[Path]:
     """Extract linked pages from index that seem relevant to the question."""
     # Pull all [[links]] and markdown links from index
     md_links = re.findall(r'\[([^\]]+)\]\(([^)]+)\)', index_content)
-    # Simple keyword match: check if any word in the title appears in the question
     question_lower = question.lower()
     relevant = []
+    
     for title, href in md_links:
-        if any(word in question_lower for word in title.lower().split() if len(word) > 3):
+        title_lower = title.lower()
+        match = False
+        
+        # 1. English/space-separated: match if any title word longer than 3 chars appears in the question
+        if any(word in question_lower for word in title_lower.split() if len(word) > 3):
+            match = True
+        # 2. Exact substring match for the whole title (useful for short CJK titles, e.g. len=2)
+        elif len(title_lower) >= 2 and title_lower in question_lower:
+            match = True
+        # 3. Non-ASCII chunks (e.g. CJK): find runs of 2+ contiguous non-ASCII characters in the title and check if any is in the question
+        elif any(chunk in question_lower for chunk in re.findall(r'[^\x00-\x7F]{2,}', title_lower)):
+            match = True
+            
+        if match:
             p = WIKI_DIR / href
-            if p.exists():
+            if p.exists() and p not in relevant:
                 relevant.append(p)
+                
     # Always include overview
     overview = WIKI_DIR / "overview.md"
     if overview.exists() and overview not in relevant: