Files
llm-wiki-agent/tools/sync.py

568 lines
20 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Wiki ↔ Raw 三向同步工具
功能:
- 检测 raw/ 下文件变化(新增/修改/删除)
- 自动调用 ingest.py 进行同步
- 维护 manifest.json 状态映射
- 检测 orphan entity/concept(仅报告,不删除)
用法:
python tools/sync.py --check 预览变化(不执行)
python tools/sync.py --sync 执行同步
python tools/sync.py --rebuild 从 manifest 重建 wiki/index(兜底)
python tools/sync.py --bootstrap 从现有 wiki sources 反向生成 manifest(首次用,跳过已 ingest 的文件)
manifest.json 格式:
{
"version": 1,
"updated_at": "ISO timestamp",
"files": {
"relative/path/to/file.md": {
"hash": "sha256",
"modified": "ISO timestamp",
"slug": "wiki-source-slug",
"source_path": "wiki/sources/slug.md",
"ingested": true
}
}
}
"""
import os
import sys
import json
import hashlib
import subprocess
from pathlib import Path
from datetime import datetime, timezone
# Repository root: two directory levels above this file.
REPO_ROOT = Path(__file__).parent.parent
# Wiki directory; the code below reads sources/, entities/ and concepts/ under it
# and writes index.md and manifest.json into it.
WIKI_DIR = REPO_ROOT / "wiki"
# JSON manifest mapping each raw file to its recorded hash, slug and ingest state.
MANIFEST_FILE = WIKI_DIR / "manifest.json"
# Path to CLAUDE.md at the repository root (not referenced by the code visible here).
SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"
# ─── 工具函数 ───────────────────────────────────────────────
def green(text):
    """Wrap *text* in the ANSI bright-green sequence (SGR 92) followed by a reset."""
    return "\033[92m{}\033[0m".format(text)
def yellow(text):
    """Wrap *text* in the ANSI bright-yellow sequence (SGR 93) followed by a reset."""
    return "\033[93m{}\033[0m".format(text)
def red(text):
    """Wrap *text* in the ANSI bright-red sequence (SGR 91) followed by a reset."""
    return "\033[91m{}\033[0m".format(text)
def dim(text):
    """Wrap *text* in the ANSI faint sequence (SGR 2) followed by a reset."""
    return "\033[2m{}\033[0m".format(text)
def bold(text):
    """Wrap *text* in the ANSI bold sequence (SGR 1) followed by a reset."""
    return "\033[1m{}\033[0m".format(text)
def log(msg, style="normal"):
    """Print *msg* preceded by the prefix registered for *style*.

    Args:
        msg: Message to print.
        style: One of "normal", "info", "success", "warn", "error" or
            "section". Any other value falls back to a single-space prefix.
    """
    lead_by_style = {
        "normal": " ",
        "info": " ",
        "success": "",
        "warn": "",
        "error": "",
        "section": "\n── ",
    }
    lead = lead_by_style[style] if style in lead_by_style else " "
    print("{}{}".format(lead, msg))
def sha256_file(path: Path) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *path*.

    The file is hashed in fixed-size chunks so large files are never held in
    memory all at once; the resulting digest is identical to hashing the
    whole content in one call.

    Args:
        path: File to hash.

    Returns:
        A 16-character lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        # 1 MiB per read keeps memory bounded regardless of file size.
        while chunk := fh.read(1 << 20):
            digest.update(chunk)
    return digest.hexdigest()[:16]
def iso_now():
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
def load_manifest() -> dict:
    """Load the manifest from MANIFEST_FILE, falling back to an empty one.

    A missing file yields a fresh manifest silently. An unreadable, undecodable
    or malformed file also yields a fresh manifest, but a warning is written to
    stderr first so the reset does not go unnoticed.

    Returns:
        A dict that always contains a "files" mapping.
    """
    if MANIFEST_FILE.exists():
        try:
            data = json.loads(MANIFEST_FILE.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(
                f"warning: cannot read {MANIFEST_FILE} ({exc}); starting from an empty manifest",
                file=sys.stderr,
            )
        else:
            if isinstance(data, dict) and isinstance(data.get("files", {}), dict):
                data.setdefault("files", {})
                return data
            print(
                f"warning: {MANIFEST_FILE} has an unexpected structure; starting from an empty manifest",
                file=sys.stderr,
            )
    return {"version": 1, "updated_at": iso_now(), "files": {}}
def save_manifest(manifest: dict):
    """Stamp *manifest* with the current time and write it to MANIFEST_FILE.

    The JSON is written to a sibling temporary file and then moved into place
    with os.replace, so an interrupted run cannot leave a truncated manifest
    behind. The parent directory is created if it does not exist yet.

    Args:
        manifest: Manifest dict; its "updated_at" key is overwritten in place.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    manifest["updated_at"] = iso_now()
    payload = json.dumps(manifest, ensure_ascii=False, indent=2)
    MANIFEST_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_file = MANIFEST_FILE.with_name(MANIFEST_FILE.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    os.replace(tmp_file, MANIFEST_FILE)
def scan_raw() -> dict[str, dict]:
    """Scan raw/ recursively for Markdown files.

    Returns:
        A mapping ``{relative_path: {"hash", "modified", "size", "abs_path"}}``
        where ``relative_path`` is relative to REPO_ROOT and always uses
        forward slashes, matching the ``raw/...`` keys that run_bootstrap
        writes. Files whose name starts with "." are skipped. An empty dict is
        returned when raw/ does not exist.
    """
    raw_dir = REPO_ROOT / "raw"
    result = {}
    if not raw_dir.exists():
        return result
    # Sorted so that the processing order is stable between runs.
    for p in sorted(raw_dir.rglob("*.md")):
        if not p.is_file() or p.name.startswith("."):
            continue
        # as_posix() keeps manifest keys identical across operating systems.
        rel = p.relative_to(REPO_ROOT).as_posix()
        stat = p.stat()
        result[rel] = {
            "hash": sha256_file(p),
            "modified": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat(),
            "size": stat.st_size,
            "abs_path": str(p),
        }
    return result
def build_slug_from_path(rel_path: str) -> str:
    """Derive a slug from the file stem of *rel_path*.

    Alphanumeric characters (including CJK) and "-", "_", "·" are kept; every
    other character becomes "-". Leading and trailing dashes are trimmed, and
    "untitled" is returned when nothing is left.
    """
    keep = ("-", "_", "·")
    stem = Path(rel_path).stem
    # Spaces and path separators are not alphanumeric, so they also map to "-".
    pieces = [ch if (ch.isalnum() or ch in keep) else "-" for ch in stem]
    slug = "".join(pieces).strip("-")
    if not slug:
        return "untitled"
    return slug
def call_ingest(source_path: str, slug: str = None) -> dict:
    """Run tools/ingest.py on *source_path* in a subprocess.

    Args:
        source_path: Path of the raw file, passed as the only CLI argument.
        slug: Accepted for interface compatibility; it is not forwarded to
            ingest.py.

    Returns:
        ``{"success": bool, "stdout": str, "stderr": str}``. ``success`` is
        True only when the subprocess exits with code 0. A timeout (300 s) or
        any other exception is reported as a failure with the reason in
        ``stderr``; this function does not raise.
    """
    cmd = [sys.executable, str(REPO_ROOT / "tools" / "ingest.py"), source_path]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Undecodable bytes in the child's output must not turn an
            # otherwise successful ingest into a reported failure.
            errors="replace",
            timeout=300,
            cwd=str(REPO_ROOT),
        )
        return {
            "success": result.returncode == 0,
            "stdout": result.stdout,
            "stderr": result.stderr,
        }
    except subprocess.TimeoutExpired:
        return {"success": False, "stdout": "", "stderr": "Timeout (>5min)"}
    except Exception as e:
        # Boundary handler: callers expect a result dict, never an exception.
        return {"success": False, "stdout": "", "stderr": str(e)}
def find_orphan_entity_concept(manifest: dict) -> tuple[list, list]:
    """Find entity and concept pages that no source page links to.

    Every ``[[wikilink]]`` in wiki/sources/*.md is collected. Links starting
    with ``entities/`` or ``concepts/`` count for that kind only; links without
    any "/" count for both kinds. A ``|alias`` or ``#anchor`` suffix and a
    trailing ``.md`` are ignored, and only the final path component is
    compared, so names containing dots (e.g. ``GPT-4.5``) match their page.

    Args:
        manifest: Not used by this function; kept for interface compatibility.

    Returns:
        ``(orphan_entities, orphan_concepts)`` as lists of file names.
    """
    import re

    wikilink_pattern = re.compile(r"\[\[([^\]]+)\]\]")
    sources_dir = WIKI_DIR / "sources"
    referenced_entities = set()
    referenced_concepts = set()

    if sources_dir.exists():
        for src in sources_dir.glob("*.md"):
            content = src.read_text(encoding="utf-8")
            for link in wikilink_pattern.findall(content):
                # Keep only the link target: drop "|alias" and "#anchor".
                name = link.split("|", 1)[0].split("#", 1)[0].strip()
                if not name:
                    continue
                # Page name = last path component without a trailing ".md".
                # Path.stem is avoided because it would cut "GPT-4.5" to "GPT-4".
                leaf = name.rsplit("/", 1)[-1]
                if leaf.lower().endswith(".md"):
                    leaf = leaf[:-3]
                if name.startswith("entities/"):
                    referenced_entities.add(leaf)
                elif name.startswith("concepts/"):
                    referenced_concepts.add(leaf)
                elif "/" not in name:
                    # A bare wikilink may refer to either an entity or a concept.
                    referenced_entities.add(leaf)
                    referenced_concepts.add(leaf)

    # Check the entities directory.
    orphan_entities = []
    entities_dir = WIKI_DIR / "entities"
    if entities_dir.exists():
        orphan_entities = [
            f.name for f in entities_dir.glob("*.md") if f.stem not in referenced_entities
        ]

    # Check the concepts directory.
    orphan_concepts = []
    concepts_dir = WIKI_DIR / "concepts"
    if concepts_dir.exists():
        orphan_concepts = [
            f.name for f in concepts_dir.glob("*.md") if f.stem not in referenced_concepts
        ]

    return orphan_entities, orphan_concepts
# ─── 核心同步逻辑 ───────────────────────────────────────────────
def check_changes(manifest: dict, raw_files: dict) -> dict:
    """Diff the manifest against the scanned raw files.

    Args:
        manifest: Manifest dict; its "files" mapping is compared.
        raw_files: Mapping of relative path to file info, as built by scan_raw.

    Returns:
        A dict with keys "new", "updated", "deleted" and "unchanged". "new"
        and "updated" hold the file info plus "rel_path" ("updated" also has
        "old_hash"); "deleted" holds rel_path/slug/source_path of manifest
        entries whose file no longer exists on disk; "unchanged" holds paths.
    """
    known = manifest.get("files", {})
    added, modified, same = [], [], []

    # Classify every file currently present under raw/.
    for path, meta in raw_files.items():
        if path not in known:
            added.append({"rel_path": path, **meta})
            continue
        recorded_hash = known[path]["hash"]
        if meta["hash"] == recorded_hash:
            same.append(path)
        else:
            modified.append({"rel_path": path, "old_hash": recorded_hash, **meta})

    # Manifest entries whose file is gone from disk are reported as deleted.
    removed = [
        {
            "rel_path": path,
            "slug": entry.get("slug", build_slug_from_path(path)),
            "source_path": entry.get("source_path"),
        }
        for path, entry in known.items()
        if not (REPO_ROOT / path).exists()
    ]

    return {"new": added, "updated": modified, "deleted": removed, "unchanged": same}
def run_sync(dry_run: bool = False, verbose: bool = False):
    """Detect changes under raw/ and apply them to the wiki and the manifest.

    Steps: load the manifest, scan raw/, diff the two, print a report, then
    (unless *dry_run*) ingest new files, re-ingest updated files, drop deleted
    files from the manifest (their wiki pages are kept), save the manifest and
    print an orphan entity/concept report.

    Args:
        dry_run: When True, only report the changes and return.
        verbose: When True, list the individual files even in dry-run mode.
    """
    print(f"\n{bold('=== Wiki Sync')}\n")
    print(f" Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print(f" Raw: {REPO_ROOT / 'raw'}")
    print(f" Wiki: {WIKI_DIR}")
    print(f" Mode: {'DRY-RUN (preview only)' if dry_run else 'LIVE SYNC'}")
    print()

    # Step 1: load manifest
    manifest = load_manifest()
    log("manifest.json loaded", "info")

    # Step 2: scan raw/
    raw_files = scan_raw()
    log(f"raw/ scan: {len(raw_files)} .md files found", "info")

    # Step 3: check changes
    changes = check_changes(manifest, raw_files)
    total_changes = len(changes["new"]) + len(changes["updated"]) + len(changes["deleted"])
    if total_changes == 0:
        log("No changes detected — wiki is up to date.", "success")
        return

    # --- Report ---
    print(f"\n{bold('--- Changes ---')}")
    print(f" {green('+')} New: {len(changes['new'])}")
    print(f" {yellow('~')} Updated: {len(changes['updated'])}")
    print(f" {red('-')} Deleted: {len(changes['deleted'])}")
    if verbose or not dry_run:
        if changes["new"]:
            print(f"\n {bold('New Files:')}")
            for f in changes["new"]:
                # Marker fixed from '[+' to '[+]' to match the other listings.
                log(f"{green('[+]')} {f['rel_path']}", "normal")
        if changes["updated"]:
            print(f"\n {bold('Updated Files:')}")
            for f in changes["updated"]:
                log(f"{yellow('[~]')} {f['rel_path']} (hash changed)", "normal")
        if changes["deleted"]:
            print(f"\n {bold('Deleted Files:')}")
            for f in changes["deleted"]:
                log(f"{red('[-]')} {f['rel_path']}", "normal")

    if dry_run:
        log("\nDry-run complete. Run with --sync to apply.", "warn")
        return

    # --- Apply Sync ---
    print(f"\n{bold('--- Applying Sync ---')}")
    # Work on a copy so the loaded manifest stays available for old values.
    updated_manifest = manifest.copy()
    updated_manifest["files"] = manifest.get("files", {}).copy()

    # (1) New files -> ingest
    for f in changes["new"]:
        rel_path = f["rel_path"]
        abs_path = f["abs_path"]
        slug = build_slug_from_path(rel_path)
        print(f"\n {green('[+]')} New: {rel_path}")
        print(f" slug: {slug}")
        result = call_ingest(abs_path, slug)
        if result["success"]:
            log(f"Ingested: {slug}.md", "success")
            updated_manifest["files"][rel_path] = {
                "hash": f["hash"],
                "modified": f["modified"],
                "slug": slug,
                "source_path": f"wiki/sources/{slug}.md",
                "ingested": True,
                "ingested_at": iso_now(),
            }
        else:
            log(f"Failed: {result['stderr'][:200]}", "error")
            # Still recorded, so the file is not re-ingested on every run.
            updated_manifest["files"][rel_path] = {
                "hash": f["hash"],
                "modified": f["modified"],
                "slug": slug,
                "source_path": f"wiki/sources/{slug}.md",
                "ingested": False,
                "ingested_at": None,
                "error": result["stderr"][:500],
            }

    # (2) Updated files -> re-ingest
    for f in changes["updated"]:
        rel_path = f["rel_path"]
        abs_path = f["abs_path"]
        old_slug = manifest.get("files", {}).get(rel_path, {}).get("slug") or build_slug_from_path(rel_path)
        print(f"\n {yellow('[~]')} Updated: {rel_path}")
        result = call_ingest(abs_path, old_slug)
        if result["success"]:
            log(f"Re-ingested: {old_slug}.md", "success")
            updated_manifest["files"][rel_path] = {
                **updated_manifest["files"].get(rel_path, {}),
                "hash": f["hash"],
                "modified": f["modified"],
                "slug": old_slug,
                "source_path": f"wiki/sources/{old_slug}.md",
                "ingested": True,
                "ingested_at": iso_now(),
            }
        else:
            # The entry keeps its old hash, so the next run retries this file.
            log(f"Failed: {result['stderr'][:200]}", "error")

    # (3) Deleted files -> keep the wiki content, only drop the manifest entry.
    for f in changes["deleted"]:
        rel_path = f["rel_path"]
        source_path = f.get("source_path")
        print(f"\n {red('[-]')} Deleted: {rel_path}")
        if source_path:
            # source_path is stored relative to the repository root
            # ("wiki/sources/<slug>.md"), so it is joined to REPO_ROOT;
            # joining it to WIKI_DIR would produce ".../wiki/wiki/...".
            sp = REPO_ROOT / source_path
            log(f" Wiki source kept: {sp}", "warn")
        # Remove from the manifest only; the wiki file is not deleted.
        if rel_path in updated_manifest["files"]:
            del updated_manifest["files"][rel_path]

    # Step 4: Save manifest
    save_manifest(updated_manifest)
    log(f"\nmanifest.json updated ({len(updated_manifest['files'])} entries)", "success")

    # Step 5: Orphan detection
    orphan_entities, orphan_concepts = find_orphan_entity_concept(updated_manifest)
    if orphan_entities or orphan_concepts:
        print(f"\n{bold('--- Orphan Report (kept as requested) ---')}")
        if orphan_entities:
            print(f" {bold('Orphan Entities')} ({len(orphan_entities)}):")
            for e in sorted(orphan_entities):
                print(f" {dim('?')} {e}")
        if orphan_concepts:
            print(f" {bold('Orphan Concepts')} ({len(orphan_concepts)}):")
            for c in sorted(orphan_concepts):
                print(f" {dim('?')} {c}")
        log("\nOrphan pages are kept (not deleted per user request).", "info")
    else:
        log("No orphan entity/concept detected.", "success")

    print(f"\n{bold('Done.')}")
def run_bootstrap():
    """Build a fresh manifest from the existing wiki/sources/*.md pages.

    Each source page is searched for a ``raw/....md`` reference. When the
    referenced raw file exists, a manifest entry is recorded for it (hash,
    mtime, slug taken from the source page name) and marked as ingested.
    Pages without such a reference, or whose raw file is missing, are skipped
    and counted. The resulting manifest replaces MANIFEST_FILE.
    """
    import re

    print(f"\n{bold('=== Wiki Bootstrap')}\n")
    print(" Scanning existing wiki sources to build manifest ...\n")
    sources_dir = WIKI_DIR / "sources"
    if not sources_dir.exists():
        print(f" {red('')} No wiki/sources/ directory found. Nothing to bootstrap.")
        return

    wikilink_pattern = re.compile(r"\[\[?raw/([^\]\s]+\.md)\]?]?", re.IGNORECASE)
    manifest = {"version": 1, "updated_at": iso_now(), "files": {}}
    # Resolve raw/ so that a symlinked directory is followed to its real path.
    raw_dir = (REPO_ROOT / "raw").resolve()
    skipped_not_found = 0
    skipped_no_source_field = 0

    # Sorted so that, if two pages reference the same raw file, the winner is
    # the same on every run.
    for src in sorted(sources_dir.glob("*.md")):
        content = src.read_text(encoding="utf-8")
        # Look for the first raw/ reference in the page.
        match = wikilink_pattern.search(content)
        if not match:
            skipped_no_source_field += 1
            continue
        # raw_rel looks like "Agent/usecases/xxx.md" (without the raw/ prefix).
        raw_rel = match.group(1).lstrip("/")
        raw_path = raw_dir / raw_rel
        if not raw_path.exists():
            # Raw file is gone: keep the source page, but add no manifest entry.
            skipped_not_found += 1
            continue
        modified_iso = datetime.fromtimestamp(
            raw_path.stat().st_mtime, tz=timezone.utc
        ).isoformat()
        slug = src.stem
        # Manifest keys are REPO_ROOT-relative, e.g. "raw/Agent/xxx.md".
        manifest["files"][f"raw/{raw_rel}"] = {
            "hash": sha256_file(raw_path),
            "modified": modified_iso,
            "slug": slug,
            "source_path": f"wiki/sources/{slug}.md",
            "ingested": True,
            # The real ingest time is unknown; the file mtime stands in for it.
            "ingested_at": modified_iso,
        }

    save_manifest(manifest)
    # Count the entries actually stored, so pages sharing one raw file are
    # not counted twice.
    bootstrapped = len(manifest["files"])
    print(f" {bold('Result:')}")
    print(f" {green('')} Manifest entries created: {bootstrapped}")
    print(f" {yellow('~')} Skipped (source file deleted): {skipped_not_found}")
    print(f" {dim('-')} Skipped (no source_file field): {skipped_no_source_field}")
    print(f"\n {green('')} manifest.json created at: {MANIFEST_FILE}")
    print(f"\n Run now: {bold('python tools/sync.py --check')} to preview new/updated files.\n")
def run_check():
    """Print a preview of pending raw/ changes; nothing is ingested or saved."""
    current_manifest = load_manifest()
    scanned = scan_raw()
    diff = check_changes(current_manifest, scanned)
    added, modified, removed = diff["new"], diff["updated"], diff["deleted"]

    print(f"\n{bold('=== Wiki Sync Check')} (preview mode)\n")
    print(f" Raw files: {len(scanned)}")
    print(f" Manifest entries: {len(current_manifest.get('files', {}))}")
    print(f" {green('+')} New: {len(added)}")
    print(f" {yellow('~')} Updated: {len(modified)}")
    print(f" {red('-')} Deleted: {len(removed)}")

    # Nothing pending: report and stop early.
    if not (added or modified or removed):
        print(f"\n {green('No changes — wiki is in sync.')}")
        print()
        return

    if added:
        print(f"\n {bold('New Files:')}")
        for item in added:
            print(f" {green('[+]')} {item['rel_path']}")
    if modified:
        print(f"\n {bold('Updated Files:')}")
        for item in modified:
            print(f" {yellow('[~]')} {item['rel_path']} (was {item['old_hash']}, now {item['hash']})")
    if removed:
        print(f"\n {bold('Deleted Files:')}")
        for item in removed:
            print(f" {red('[-]')} {item['rel_path']}")
    print()
def _page_title(markdown: str, fallback: str) -> str:
    """Return the first non-blank line of *markdown* without heading marks.

    A leading YAML front-matter block (delimited by "---" lines) is skipped.
    *fallback* is returned when no non-blank line remains.
    """
    lines = markdown.split("\n")
    start = 0
    if lines and lines[0].strip() == "---":
        # Skip front matter only when its closing delimiter is present.
        for idx in range(1, len(lines)):
            if lines[idx].strip() == "---":
                start = idx + 1
                break
    for line in lines[start:]:
        candidate = line.lstrip("# ").strip()
        if candidate:
            return candidate
    return fallback


def run_rebuild():
    """Rebuild wiki/index.md from the manifest (fallback recovery path).

    The Sources section lists every manifest entry, newest "modified" first,
    using the page's first heading line as the link label (or the slug when
    the page has no usable title). Entries whose source page is missing are
    flagged. The Entities, Concepts and Syntheses sections are written empty.
    Finally the number of orphan entities/concepts is reported.
    """
    manifest = load_manifest()
    print(f"\n{bold('=== Wiki Rebuild from Manifest')}\n")
    print(f" Manifest entries: {len(manifest.get('files', {}))}")
    print(" Rebuilding index.md ...\n")

    index_lines = [
        "# Wiki Index\n",
        "\n## Overview\n",
        "- [Overview](overview.md) — living synthesis\n",
        "\n## Sources\n",
    ]
    files = manifest.get("files", {})
    # Newest first, by the recorded "modified" timestamp.
    sorted_files = sorted(files.items(), key=lambda x: x[1].get("modified", ""), reverse=True)
    for rel_path, info in sorted_files:
        slug = info.get("slug", build_slug_from_path(rel_path))
        source_md_path = WIKI_DIR / "sources" / f"{slug}.md"
        if source_md_path.exists():
            # Falls back to the slug so the link label is never empty.
            title = _page_title(source_md_path.read_text(encoding="utf-8"), slug)
            index_lines.append(f"- [{title}](sources/{slug}.md)\n")
        else:
            index_lines.append(f"- [{slug}](sources/{slug}.md) — (source missing)\n")
    index_lines.append("\n## Entities\n\n## Concepts\n\n## Syntheses\n")

    index_file = WIKI_DIR / "index.md"
    index_file.write_text("".join(index_lines), encoding="utf-8")
    print(f" {green('')} index.md rebuilt with {len(sorted_files)} sources")

    # Orphan report
    orphan_entities, orphan_concepts = find_orphan_entity_concept(manifest)
    if orphan_entities:
        print(f" {dim('?')} Orphan entities: {len(orphan_entities)}")
    if orphan_concepts:
        print(f" {dim('?')} Orphan concepts: {len(orphan_concepts)}")
    print("\nDone.")
# ─── CLI 入口 ───────────────────────────────────────────────
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Wiki ↔ Raw 三向同步工具",
    )
    # Mode switches, registered in the order they appear in --help.
    mode_flags = (
        ("--check", "预览变化,不执行同步"),
        ("--sync", "执行完整同步(新增/修改/删除 + orphan 检测)"),
        ("--rebuild", "从 manifest 重建 wiki/index.md兜底方案"),
        ("--bootstrap", "从现有 wiki sources 反向生成 manifest首次使用跳过已 ingest 的文件)"),
    )
    for flag, help_text in mode_flags:
        cli.add_argument(flag, action="store_true", help=help_text)
    cli.add_argument("--verbose", "-v", action="store_true", help="详细输出")
    opts = cli.parse_args()

    # When several modes are given, the first enabled one in this order wins.
    dispatch = (
        (opts.bootstrap, run_bootstrap),
        (opts.rebuild, run_rebuild),
        (opts.check, run_check),
        (opts.sync, lambda: run_sync(dry_run=False, verbose=opts.verbose)),
    )
    chosen = next((action for enabled, action in dispatch if enabled), None)
    if chosen is not None:
        chosen()
    else:
        # No mode selected: show the help text followed by usage examples.
        cli.print_help()
        for example in (
            "\n示例:",
            " python tools/sync.py --check # 预览变化",
            " python tools/sync.py --sync # 执行同步",
            " python tools/sync.py --sync -v # 详细模式",
            " python tools/sync.py --rebuild # 重建 index",
            " python tools/sync.py --bootstrap # 首次:从 wiki sources 生成 manifest",
        ):
            print(example)