Files
llm-wiki-agent/tools/sync.py
2026-04-16 17:09:26 +08:00

632 lines
22 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Wiki ↔ Raw 三向同步工具
功能:
- 检测 raw/ 下文件变化(新增/修改/删除)
- 调用 Claude Code agent 进行同步(不使用 litellm)
- 维护 manifest.json 状态映射
- 检测 orphan entity/concept(仅报告,不删除)
用法:
python tools/sync.py --check 预览变化(不执行)
python tools/sync.py --sync 执行同步
python tools/sync.py --rebuild 从 manifest 重建 wiki/index(兜底)
manifest.json 格式:
{
"version": 1,
"updated_at": "ISO timestamp",
"files": {
"relative/path/to/file.md": {
"hash": "sha256",
"modified": "ISO timestamp",
"slug": "wiki-source-slug",
"source_path": "wiki/sources/slug.md",
"ingested": true
}
}
}
"""
import os
import sys
import json
import hashlib
import subprocess
from pathlib import Path
from datetime import datetime, timezone
# Repository root: the parent of the directory that contains this script.
REPO_ROOT = Path(__file__).parent.parent.resolve()
# Wiki directory directly under the repository root.
WIKI_DIR = REPO_ROOT / "wiki"
# manifest.json is kept in tools/ rather than wiki/ to avoid symlink path problems.
MANIFEST_FILE = Path(__file__).parent / "manifest.json"
# CLAUDE.md at the repository root.
SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"
# ─── 工具函数 ───────────────────────────────────────────────
def green(text: object) -> str:
    """Return *text* wrapped in the ANSI escape ``92`` and a trailing reset."""
    return f"\033[92m{text}\033[0m"
def yellow(text: object) -> str:
    """Return *text* wrapped in the ANSI escape ``93`` and a trailing reset."""
    return f"\033[93m{text}\033[0m"
def red(text: object) -> str:
    """Return *text* wrapped in the ANSI escape ``91`` and a trailing reset."""
    return f"\033[91m{text}\033[0m"
def dim(text: object) -> str:
    """Return *text* wrapped in the ANSI escape ``2`` and a trailing reset."""
    return f"\033[2m{text}\033[0m"
def bold(text: object) -> str:
    """Return *text* wrapped in the ANSI escape ``1`` and a trailing reset."""
    return f"\033[1m{text}\033[0m"
def log(msg: object, style: str = "normal") -> None:
    """Print *msg* preceded by the prefix registered for *style*.

    Args:
        msg: Value to print; it is formatted with ``str()`` semantics.
        style: One of ``normal``, ``info``, ``success``, ``warn``, ``error``
            or ``section``. Any other value falls back to a single space.
    """
    prefixes = {
        "normal": " ",
        "info": " ",
        "success": "",
        "warn": "",
        "error": "",
        "section": "\n── ",
    }
    print(f"{prefixes.get(style, ' ')}{msg}")
def sha256_file(path: Path) -> str:
    """Return the first 16 hex digits of the SHA-256 digest of the file at *path*.

    The file is read in fixed-size chunks so that large files are never held
    in memory in full. The truncation to 16 characters is the format stored
    in the manifest's ``hash`` field by this script.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 16), b""):
            digest.update(chunk)
    return digest.hexdigest()[:16]
def iso_now() -> str:
    """Return the current time in UTC as a timezone-aware ISO 8601 string."""
    return datetime.now(timezone.utc).isoformat()
def load_manifest() -> dict:
    """Load the manifest from ``MANIFEST_FILE``.

    Returns:
        The parsed manifest, guaranteed to be a dict with a dict under
        ``"files"``. If the file is missing, unreadable, not valid JSON or
        not a JSON object, a fresh empty manifest is returned instead.

    A problem with an existing file is reported on stderr rather than being
    swallowed, because falling back to an empty manifest makes every raw file
    look new.
    """
    if MANIFEST_FILE.exists():
        try:
            data = json.loads(MANIFEST_FILE.read_text(encoding="utf-8"))
        except (ValueError, OSError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(
                f"Warning: cannot read {MANIFEST_FILE}: {exc}; using an empty manifest",
                file=sys.stderr,
            )
        else:
            if isinstance(data, dict):
                # Callers index manifest["files"] directly, so make sure it exists.
                if not isinstance(data.get("files"), dict):
                    data["files"] = {}
                return data
            print(
                f"Warning: {MANIFEST_FILE} does not contain a JSON object; using an empty manifest",
                file=sys.stderr,
            )
    return {"version": 1, "updated_at": iso_now(), "files": {}}
def save_manifest(manifest: dict) -> None:
    """Stamp *manifest* with the current time and write it to ``MANIFEST_FILE``.

    The JSON is written to a sibling temporary file first and then moved over
    the real file, so an interrupted write cannot leave a truncated manifest
    behind.

    Args:
        manifest: Manifest dict; its ``"updated_at"`` key is overwritten in place.

    Raises:
        OSError: If the temporary file cannot be written or moved into place.
    """
    manifest["updated_at"] = iso_now()
    payload = json.dumps(manifest, ensure_ascii=False, indent=2)
    tmp_file = MANIFEST_FILE.with_name(MANIFEST_FILE.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    # NOTE(review): if manifest.json is itself a symlink, this replaces the link
    # with a regular file instead of writing through it.
    tmp_file.replace(MANIFEST_FILE)
def scan_raw() -> dict[str, dict]:
    """Scan ``raw/`` recursively for Markdown files.

    Returns:
        ``{relative_path: {"hash", "modified", "size", "abs_path"}}`` where
        ``relative_path`` is relative to ``REPO_ROOT``. Files whose name starts
        with a dot are skipped. The result is empty when ``raw/`` does not
        exist. Entries are inserted in sorted path order so reports are stable
        between runs.
    """
    raw_dir = REPO_ROOT / "raw"
    result: dict[str, dict] = {}
    if not raw_dir.exists():
        return result
    for p in sorted(raw_dir.rglob("*.md")):
        if p.name.startswith(".") or not p.is_file():
            continue
        stat = p.stat()
        result[str(p.relative_to(REPO_ROOT))] = {
            "hash": sha256_file(p),
            "modified": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat(),
            "size": stat.st_size,
            "abs_path": str(p),
        }
    return result
def build_slug_from_path(rel_path: str) -> str:
    """Derive a wiki slug from the file name in *rel_path*.

    The file stem is kept as is (including CJK and other Unicode letters and
    digits); every character that is not alphanumeric, ``-``, ``_`` or ``·``
    becomes ``-``. Leading and trailing dashes are removed; runs of dashes in
    the middle are not collapsed.

    Returns:
        The slug, or ``"untitled"`` when nothing is left.
    """
    stem = Path(rel_path).stem
    # Spaces and path separators are not alphanumeric, so this single pass
    # already turns them into "-"; no separate replace() step is needed.
    slug = "".join(ch if ch.isalnum() or ch in "-_·" else "-" for ch in stem)
    return slug.strip("-") or "untitled"
def call_ingest(abs_path: str, slug: str, json_mode: bool = False) -> dict:
    """Return a placeholder result asking for a manual TMUX-driven ``/wiki-ingest``.

    This function does not start Claude Code itself. It only hands back a
    special status so the caller (Hermes) can drive the TMUX session manually.

    Args:
        abs_path: Absolute path of the raw file to ingest (echoed back).
        slug: Proposed wiki slug (echoed back).
        json_mode: Accepted for interface compatibility; currently unused.

    Returns:
        A dict with ``status`` set to ``"needs_manual_tmux"`` plus
        ``abs_path``, ``slug`` and a human-readable ``message``.
    """
    return {
        "status": "needs_manual_tmux",
        "abs_path": abs_path,
        "slug": slug,
        "message": "需要通过 TMUX 手动执行 /wiki-ingest请使用 llm-wiki-sync 技能流程",
    }
def parse_slug_from_output(output: str) -> str | None:
"""从 TMUX 输出中解析 SLUG: xxx 行"""
import re
match = re.search(r"SLUG:\s*([a-zA-Z0-9_-]+)", output)
return match.group(1) if match else None
def update_manifest_with_slug(rel_path: str, actual_slug: str) -> bool:
    """Record the slug chosen during ingest for *rel_path* in the manifest.

    Sets ``slug``, ``source_path`` (``wiki/sources/<slug>.md``), ``ingested``
    and ``ingested_at`` on the existing entry and refreshes the manifest's
    ``updated_at``; both timestamps share one value.

    Returns:
        ``True`` when the entry was found and the manifest was written.
        ``False`` when the manifest file is missing or unreadable, has no
        entry for *rel_path*, or cannot be written. Read/write problems are
        also printed.
    """
    manifest_file = Path(__file__).parent / "manifest.json"
    if not manifest_file.exists():
        return False
    try:
        manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        print(f"Error updating manifest: {e}")
        return False

    files = manifest.get("files") if isinstance(manifest, dict) else None
    entry = files.get(rel_path) if isinstance(files, dict) else None
    if not isinstance(entry, dict):
        # Nothing to update: do not rewrite the file or report success.
        return False

    now = datetime.now(timezone.utc).isoformat()
    entry["slug"] = actual_slug
    entry["source_path"] = f"wiki/sources/{actual_slug}.md"
    entry["ingested"] = True
    entry["ingested_at"] = now
    manifest["updated_at"] = now
    try:
        manifest_file.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
        )
    except OSError as e:
        print(f"Error updating manifest: {e}")
        return False
    return True
def start_tmux_session() -> bool:
    """Ensure the detached TMUX session used for wiki ingest exists.

    If the session is already present nothing is started. Otherwise a new
    detached session is created that runs Claude Code with the repository
    root as its working directory.

    Returns:
        ``True`` in both cases.

    Raises:
        subprocess.CalledProcessError: If ``tmux new-session`` fails.
    """
    session_name = "wiki-ingest"
    # Check whether the TMUX session already exists.
    check = subprocess.run(
        ["tmux", "has-session", "-t", session_name],
        capture_output=True,
    )
    if check.returncode == 0:
        print(f" TMUX session '{session_name}' already exists")
        return True
    # Create a new session. The working directory is passed with -c instead of
    # being spliced into a "cd ... &&" shell string, so a repository path that
    # contains spaces or shell metacharacters cannot break the command.
    subprocess.run(
        ["tmux", "new-session", "-d", "-s", session_name,
         "-c", str(REPO_ROOT),
         "claude --permission-mode bypassPermissions"],
        check=True,
    )
    print(f" Created TMUX session '{session_name}'")
    return True
def send_to_tmux(command: str) -> bool:
    """Type *command* into the wiki-ingest TMUX session and press Enter.

    The text is sent with ``-l`` so tmux treats it literally; without it a
    command that happens to look like a key name (``Enter``, ``C-c``, ...)
    would be sent as that key. Enter is then sent as a separate key press
    rather than as an embedded line-feed character.

    Returns:
        ``True`` when both tmux invocations succeed.

    Raises:
        subprocess.CalledProcessError: If either ``tmux send-keys`` call fails.
    """
    session_name = "wiki-ingest"
    subprocess.run(
        ["tmux", "send-keys", "-t", session_name, "-l", "--", command],
        check=True,
    )
    subprocess.run(
        ["tmux", "send-keys", "-t", session_name, "Enter"],
        check=True,
    )
    return True
def wait_for_completion(seconds: int = 120) -> bool:
    """Block for *seconds* to give Claude Code time to finish.

    This is a fixed delay; it does not check whether processing completed.

    Returns:
        Always ``True``.
    """
    import time

    print(f" Waiting {seconds}s for Claude Code to complete...")
    time.sleep(seconds)
    return True
def cleanup_tmux() -> bool:
    """Kill the wiki-ingest TMUX session if it can be killed.

    A failing ``tmux kill-session`` is tolerated, but the message now reflects
    the actual outcome instead of always claiming the session was killed.

    Returns:
        Always ``True``.
    """
    session_name = "wiki-ingest"
    result = subprocess.run(
        ["tmux", "kill-session", "-t", session_name],
        capture_output=True,
    )
    if result.returncode == 0:
        print(f" Killed TMUX session '{session_name}'")
    else:
        print(f" TMUX session '{session_name}' could not be killed (not running?)")
    return True
def find_orphan_entity_concept(manifest: dict) -> tuple[list, list]:
    """Find entity and concept pages that no source page links to.

    Every ``[[wikilink]]`` in ``wiki/sources/*.md`` is collected. Links that
    start with ``entities/`` or ``concepts/`` count for that kind only; a bare
    link without ``/`` may be either, so it counts for both. Other paths are
    ignored. Before comparison the alias (``|label``), heading anchor
    (``#section``), folder and a trailing ``.md`` are removed from the link.

    Args:
        manifest: Unused; accepted for interface compatibility.

    Returns:
        ``(orphan_entities, orphan_concepts)`` as lists of file names.
    """
    import re

    wikilink_pattern = re.compile(r"\[\[([^\]]+)\]\]")
    sources_dir = WIKI_DIR / "sources"
    referenced_entities = set()
    referenced_concepts = set()
    if sources_dir.exists():
        for src in sources_dir.glob("*.md"):
            content = src.read_text(encoding="utf-8")
            for link in wikilink_pattern.findall(content):
                # "[[target|label]]" and "[[target#heading]]" both refer to "target".
                target = link.split("|", 1)[0].split("#", 1)[0].strip()
                # Strip only a literal ".md"; Path.stem would also cut dotted
                # names such as "v1.2" down to "v1".
                page = target.rsplit("/", 1)[-1].removesuffix(".md")
                if target.startswith("entities/"):
                    referenced_entities.add(page)
                elif target.startswith("concepts/"):
                    referenced_concepts.add(page)
                elif "/" not in target:
                    # Bare wikilink: could be an entity or a concept.
                    referenced_entities.add(page)
                    referenced_concepts.add(page)

    # Entity pages that were never referenced.
    orphan_entities = []
    entities_dir = WIKI_DIR / "entities"
    if entities_dir.exists():
        orphan_entities = [
            f.name for f in entities_dir.glob("*.md") if f.stem not in referenced_entities
        ]

    # Concept pages that were never referenced.
    orphan_concepts = []
    concepts_dir = WIKI_DIR / "concepts"
    if concepts_dir.exists():
        orphan_concepts = [
            f.name for f in concepts_dir.glob("*.md") if f.stem not in referenced_concepts
        ]
    return orphan_entities, orphan_concepts
# ─── 核心同步逻辑 ───────────────────────────────────────────────
def check_changes(manifest: dict, raw_files: dict) -> dict:
    """Compare the manifest with the scanned raw files.

    Args:
        manifest: Manifest dict; its ``"files"`` mapping is read.
        raw_files: Result of ``scan_raw()``: ``{rel_path: info}`` where
            ``info`` contains at least ``"hash"``.

    Returns:
        A dict with four lists:
        ``new`` (``{"rel_path", **info}``),
        ``updated`` (``{"rel_path", "old_hash", **info}``),
        ``deleted`` (``{"rel_path", "slug", "source_path"}``) and
        ``unchanged`` (plain relative paths).
    """
    changes = {"new": [], "updated": [], "deleted": [], "unchanged": []}
    manifest_files = manifest.get("files", {})

    # Walk the files currently present under raw/.
    for rel_path, info in raw_files.items():
        entry = manifest_files.get(rel_path)
        if entry is None:
            changes["new"].append({"rel_path": rel_path, **info})
            continue
        # An entry without a stored hash cannot be proven unchanged, so it is
        # reported as updated instead of raising KeyError.
        old_hash = entry.get("hash")
        if info["hash"] != old_hash:
            changes["updated"].append({
                "rel_path": rel_path,
                "old_hash": old_hash,
                **info,
            })
        else:
            changes["unchanged"].append(rel_path)

    # Walk the manifest to find entries whose file has disappeared.
    for rel_path, entry in manifest_files.items():
        if rel_path in raw_files:
            # Just scanned from disk, so it exists; no extra filesystem call.
            continue
        if not (REPO_ROOT / rel_path).exists():
            changes["deleted"].append({
                "rel_path": rel_path,
                "slug": entry.get("slug", build_slug_from_path(rel_path)),
                "source_path": entry.get("source_path"),
            })
    return changes
def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = False):
    """Detect changes under raw/ and record what still has to be ingested.

    Steps: load the manifest, scan raw/, diff the two, report the changes,
    and (unless *dry_run*) drop deleted files from the manifest, list new and
    updated files as pending, and run the orphan report. Ingestion itself is
    not performed here; pending files are only listed so that they can be
    processed through the manual TMUX workflow.

    Args:
        dry_run: Report only; the manifest is not written.
        verbose: In dry-run mode, also list the individual files.
        json_mode: Emit one JSON object per event instead of the
            human-readable listings. The banner and ``log()`` lines are still
            printed as plain text.
    """
    print(f"\n{bold('=== Wiki Sync')}\n", end="")
    print(f" Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
    print(f" Raw: {REPO_ROOT / 'raw'}\n")
    print(f" Wiki: {WIKI_DIR}\n")
    print(f" Mode: {'DRY-RUN (preview only)' if dry_run else 'LIVE SYNC'}")

    # Step 1: load manifest
    manifest = load_manifest()
    known_files = manifest.get("files", {})
    log("manifest.json loaded", "info")

    # Step 2: scan raw/
    raw_files = scan_raw()
    log(f"raw/ scan: {len(raw_files)} .md files found", "info")

    # Step 3: check changes
    changes = check_changes(manifest, raw_files)
    total_changes = len(changes["new"]) + len(changes["updated"]) + len(changes["deleted"])
    if total_changes == 0:
        log("No changes detected — wiki is up to date.", "success")
        return

    # ─── Report ───
    if not json_mode:
        print(f"\n{bold('--- Changes ---')}")
        print(f" {green('+')} New: {len(changes['new'])}")
        print(f" {yellow('~')} Updated: {len(changes['updated'])}")
        print(f" {red('-')} Deleted: {len(changes['deleted'])}")

    if verbose or not dry_run:
        if changes["new"]:
            if not json_mode:
                print(f"\n {bold('New Files:')}")
            for f in changes["new"]:
                if json_mode:
                    print(json.dumps({
                        "event": "new_detected",
                        "rel_path": f["rel_path"],
                        "slug": build_slug_from_path(f["rel_path"]),
                    }))
                else:
                    # Marker now closes its bracket, matching [~] and [-].
                    log(f"{green('[+]')} {f['rel_path']}", "normal")
        if changes["updated"]:
            if not json_mode:
                print(f"\n {bold('Updated Files:')}")
            for f in changes["updated"]:
                if json_mode:
                    print(json.dumps({
                        "event": "updated_detected",
                        "rel_path": f["rel_path"],
                        "slug": known_files.get(f["rel_path"], {}).get("slug")
                        or build_slug_from_path(f["rel_path"]),
                    }))
                else:
                    log(f"{yellow('[~]')} {f['rel_path']} (hash changed)", "normal")
        if changes["deleted"]:
            if not json_mode:
                print(f"\n {bold('Deleted Files:')}")
            for f in changes["deleted"]:
                if json_mode:
                    print(json.dumps({
                        "event": "deleted_detected",
                        "rel_path": f["rel_path"],
                    }))
                else:
                    log(f"{red('[-]')} {f['rel_path']}", "normal")

    if dry_run:
        log("\nDry-run complete. Run with --sync to apply.", "warn")
        return

    # ─── Apply Sync ───
    # Ingest is not executed here: new and updated files are only reported as
    # pending so that Hermes can run /wiki-ingest through TMUX manually.
    if not json_mode:
        print(f"\n{bold('--- Applying Sync ---')}")
        print(" Note: Ingest execution requires manual TMUX workflow")
        print(" Use llm-wiki-sync skill for actual ingestion")
        print()

    updated_manifest = manifest.copy()
    updated_manifest["files"] = known_files.copy()

    # New and updated files are collected as pending (handled manually later).
    pending_files = []

    # (1) New files -> pending. The manifest entry is deliberately NOT created
    # here; it is written only after the ingest has actually been done.
    for f in changes["new"]:
        rel_path = f["rel_path"]
        abs_path = f["abs_path"]
        slug = build_slug_from_path(rel_path)
        if json_mode:
            print(json.dumps({"event": "pending", "rel_path": rel_path, "slug": slug, "action": "new"}))
        pending_files.append({"rel_path": rel_path, "abs_path": abs_path, "slug": slug, "action": "new"})

    # (2) Updated files -> pending, keeping the slug already recorded if any.
    for f in changes["updated"]:
        rel_path = f["rel_path"]
        abs_path = f["abs_path"]
        old_slug = known_files.get(rel_path, {}).get("slug") or build_slug_from_path(rel_path)
        if json_mode:
            print(json.dumps({"event": "pending", "rel_path": rel_path, "slug": old_slug, "action": "updated"}))
        pending_files.append({"rel_path": rel_path, "abs_path": abs_path, "slug": old_slug, "action": "updated"})

    # (3) Deleted files -> wiki content is kept, only the manifest entry goes.
    deleted_files = []
    for f in changes["deleted"]:
        rel_path = f["rel_path"]
        source_path = f.get("source_path")
        log(f"Deleted: {rel_path}", "warn")
        if source_path:
            # source_path is stored relative to the repository root
            # ("wiki/sources/<slug>.md"), so it must not be joined to WIKI_DIR.
            sp = REPO_ROOT / source_path
            log(f" Wiki source kept: {sp}", "warn")
        if rel_path in updated_manifest["files"]:
            del updated_manifest["files"][rel_path]
        deleted_files.append(rel_path)

    # Step 4: save the manifest. Only deletions are persisted; pending files
    # stay out of the manifest until they have really been ingested, which
    # keeps them detectable as pending on the next run.
    save_manifest(updated_manifest)
    log(f"\nmanifest.json updated ({len(updated_manifest['files'])} entries)", "success")

    # Emit the pending list for the manual workflow.
    if json_mode:
        print(json.dumps({
            "event": "sync_complete",
            "summary": {
                "pending": len(pending_files),
                "deleted": len(deleted_files),
                "manifest_entries": len(updated_manifest["files"]),
            },
            "pending_files": pending_files,
            "deleted_files": deleted_files,
        }))

    # Step 5: orphan detection (report only, nothing is removed).
    orphan_entities, orphan_concepts = find_orphan_entity_concept(updated_manifest)
    if not json_mode:
        if orphan_entities or orphan_concepts:
            print(f"\n{bold('--- Orphan Report (kept as requested) ---')}")
            if orphan_entities:
                print(f" {bold('Orphan Entities')} ({len(orphan_entities)}):")
                for e in sorted(orphan_entities):
                    print(f" {dim('?')} {e}")
            if orphan_concepts:
                print(f" {bold('Orphan Concepts')} ({len(orphan_concepts)}):")
                for c in sorted(orphan_concepts):
                    print(f" {dim('?')} {c}")
            log("\nOrphan pages are kept (not deleted per user request).", "info")
        else:
            log("No orphan entity/concept detected.", "success")

    # Step 6: closing summary (the JSON summary was already emitted above).
    print(f"\n{bold('Done.')}")
    print(f"\n Pending files for manual TMUX ingestion: {len(pending_files)}")
    print(" Use llm-wiki-sync skill to process these files.")
def run_check():
    """Preview the differences between raw/ and the manifest without changing anything."""
    manifest = load_manifest()
    raw_files = scan_raw()
    changes = check_changes(manifest, raw_files)
    total = len(changes["new"]) + len(changes["updated"]) + len(changes["deleted"])

    print(f"\n{bold('=== Wiki Sync Check')} (preview mode)\n")
    print(f" Raw files: {len(raw_files)}")
    print(f" Manifest entries: {len(manifest.get('files', {}))}")
    print(f" {green('+')} New: {len(changes['new'])}")
    print(f" {yellow('~')} Updated: {len(changes['updated'])}")
    print(f" {red('-')} Deleted: {len(changes['deleted'])}")

    if total > 0:
        if changes["new"]:
            print(f"\n {bold('New Files:')}")
            for f in changes["new"]:
                print(f" {green('[+]')} {f['rel_path']}")
        if changes["updated"]:
            print(f"\n {bold('Updated Files:')}")
            for f in changes["updated"]:
                print(f" {yellow('[~]')} {f['rel_path']} (was {f['old_hash']}, now {f['hash']})")
        if changes["deleted"]:
            print(f"\n {bold('Deleted Files:')}")
            for f in changes["deleted"]:
                print(f" {red('[-]')} {f['rel_path']}")
    else:
        print(f"\n {green('No changes — wiki is in sync.')}")
    print()
def _source_title(page: Path, fallback: str) -> str:
    """Return the first non-empty line of *page* after any YAML front matter, with heading marks stripped."""
    lines = page.read_text(encoding="utf-8").splitlines()
    if lines and lines[0].strip() == "---":
        # Skip a leading front-matter block, but only if it is properly closed.
        for idx, line in enumerate(lines[1:], start=1):
            if line.strip() == "---":
                lines = lines[idx + 1:]
                break
    for line in lines:
        text = line.lstrip("# ").strip()
        if text:
            return text
    return fallback


def run_rebuild():
    """Rebuild ``wiki/index.md`` from the manifest (fallback recovery path).

    Sources are listed newest first by their manifest ``modified`` value. The
    link text is the title read from the source page, or the slug when the
    page has no usable title. Entries whose source page is missing are still
    listed and flagged. The Entities, Concepts and Syntheses sections are
    written as empty headings. An orphan summary is printed at the end.
    """
    manifest = load_manifest()
    print(f"\n{bold('=== Wiki Rebuild from Manifest')}\n")
    print(f" Manifest entries: {len(manifest.get('files', {}))}")
    print(" Rebuilding index.md ...\n")

    index_lines = [
        "# Wiki Index\n",
        "\n## Overview\n",
        "- [Overview](overview.md) — living synthesis\n",
        "\n## Sources\n",
    ]
    files = manifest.get("files", {})
    # Newest first, by the "modified" timestamp stored in the manifest.
    sorted_files = sorted(files.items(), key=lambda x: x[1].get("modified", ""), reverse=True)
    for rel_path, info in sorted_files:
        slug = info.get("slug", build_slug_from_path(rel_path))
        source_md_path = WIKI_DIR / "sources" / f"{slug}.md"
        if source_md_path.exists():
            # Fall back to the slug so the link text is never empty.
            title = _source_title(source_md_path, slug)
            index_lines.append(f"- [{title}](sources/{slug}.md)\n")
        else:
            index_lines.append(f"- [{slug}](sources/{slug}.md) — (source missing)\n")
    index_lines.append("\n## Entities\n\n## Concepts\n\n## Syntheses\n")

    # The wiki directory may not exist yet when rebuilding from scratch.
    WIKI_DIR.mkdir(parents=True, exist_ok=True)
    index_file = WIKI_DIR / "index.md"
    index_file.write_text("".join(index_lines), encoding="utf-8")
    print(f" {green('')} index.md rebuilt with {len(sorted_files)} sources")

    # Orphan report
    orphan_entities, orphan_concepts = find_orphan_entity_concept(manifest)
    if orphan_entities:
        print(f" {dim('?')} Orphan entities: {len(orphan_entities)}")
    if orphan_concepts:
        print(f" {dim('?')} Orphan concepts: {len(orphan_concepts)}")
    print("\nDone.")
# ─── CLI 入口 ───────────────────────────────────────────────
if __name__ == "__main__":
    # Command-line entry point. When several action flags are given, the first
    # match wins in this order: --rebuild, --pending, --reset-failed, --check,
    # --sync. With no action flag, the help text and examples are printed.
    import argparse

    parser = argparse.ArgumentParser(
        description="Wiki ↔ Raw 三向同步工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--check",
        action="store_true",
        help="预览变化,不执行同步",
    )
    parser.add_argument(
        "--sync",
        action="store_true",
        help="执行完整同步(新增/修改/删除 + orphan 检测)",
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="从 manifest 重建 wiki/index.md兜底方案",
    )
    parser.add_argument(
        "--reset-failed",
        action="store_true",
        help="重置所有 failed 的 ingest 状态(让它们重新待处理)",
    )
    parser.add_argument(
        "--pending",
        action="store_true",
        help="列出所有待摄取的 pending 文件",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="详细输出",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="JSON 行输出模式(供调用方解析)",
    )
    args = parser.parse_args()

    if args.rebuild:
        run_rebuild()
    elif args.pending:
        # List manifest entries that have not been ingested yet.
        manifest = load_manifest()
        pending = [k for k, v in manifest.get("files", {}).items() if not v.get("ingested")]
        print(f"=== Pending Ingest Files ({len(pending)}) ===\n")
        for i, path in enumerate(pending, 1):
            print(f"{i:3}. {path}")
    elif args.reset_failed:
        # Put entries that carry an "error" back into the pending state.
        manifest = load_manifest()
        reset_count = 0
        for v in manifest.get("files", {}).values():
            if v.get("error"):
                v["ingested"] = False
                v.pop("error", None)
                v.pop("ingested_at", None)
                reset_count += 1
        if reset_count > 0:
            save_manifest(manifest)
            print(f"Reset {reset_count} failed entries to pending.")
        else:
            print("No failed entries found.")
    elif args.check:
        run_check()
    elif args.sync:
        run_sync(dry_run=False, verbose=args.verbose, json_mode=args.json)
    else:
        parser.print_help()
        print("\n示例:")
        print(" python tools/sync.py --check # 预览变化")
        print(" python tools/sync.py --sync # 执行同步")
        print(" python tools/sync.py --sync -v # 详细模式")
        print(" python tools/sync.py --rebuild # 重建 index")