568 lines
20 KiB
Python
Executable File
568 lines
20 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Wiki ↔ Raw three-way sync tool.

Features:
- Detect file changes under raw/ (added / modified / deleted)
- Call ingest.py automatically to sync them
- Maintain the manifest.json state mapping
- Detect orphan entities/concepts (report only, never delete)

Usage:
    python tools/sync.py --check      Preview changes (nothing is executed)
    python tools/sync.py --sync       Run the sync
    python tools/sync.py --rebuild    Rebuild wiki/index from the manifest (fallback)
    python tools/sync.py --bootstrap  Generate the manifest from the existing wiki
                                      sources (first use; skips files already ingested)

manifest.json format:
    {
      "version": 1,
      "updated_at": "ISO timestamp",
      "files": {
        "relative/path/to/file.md": {
          "hash": "sha256 (first 16 hex characters)",
          "modified": "ISO timestamp",
          "slug": "wiki-source-slug",
          "source_path": "wiki/sources/slug.md",
          "ingested": true,
          "ingested_at": "ISO timestamp, or null after a failed ingest",
          "error": "truncated stderr, present only after a failed ingest"
        }
      }
    }
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import json
|
||
import hashlib
|
||
import subprocess
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
|
||
|
||
# Paths are derived from this file's location: the repository root is the
# parent of the directory that contains this script.
_SCRIPT_DIR = Path(__file__).parent
REPO_ROOT = _SCRIPT_DIR.parent
WIKI_DIR = REPO_ROOT.joinpath("wiki")
MANIFEST_FILE = WIKI_DIR.joinpath("manifest.json")
SCHEMA_FILE = REPO_ROOT.joinpath("CLAUDE.md")
|
||
|
||
|
||
# ─── 工具函数 ───────────────────────────────────────────────
|
||
|
||
def green(text):
    """Wrap *text* in the ANSI escape sequence 92 (bright green) plus a reset."""
    return "\033[92m{}\033[0m".format(text)
|
||
|
||
def yellow(text):
    """Wrap *text* in the ANSI escape sequence 93 (bright yellow) plus a reset."""
    return "\033[93m{}\033[0m".format(text)
|
||
|
||
def red(text):
    """Wrap *text* in the ANSI escape sequence 91 (bright red) plus a reset."""
    return "\033[91m{}\033[0m".format(text)
|
||
|
||
def dim(text):
    """Wrap *text* in the ANSI escape sequence 2 (dim/faint) plus a reset."""
    return "\033[2m{}\033[0m".format(text)
|
||
|
||
def bold(text):
    """Wrap *text* in the ANSI escape sequence 1 (bold) plus a reset."""
    return "\033[1m{}\033[0m".format(text)
|
||
|
||
|
||
def log(msg, style="normal"):
    """Print *msg* preceded by the prefix registered for *style*.

    Args:
        msg: The message to print.
        style: One of "normal", "info", "success", "warn", "error" or
            "section"; any other value uses the plain prefix.
    """
    prefix_by_style = {
        "normal": " ",
        "info": " ℹ ",
        "success": " ✓ ",
        "warn": " ⚠ ",
        "error": " ✗ ",
        "section": "\n── ",
    }
    prefix = prefix_by_style.get(style, ' ')
    print("{}{}".format(prefix, msg))
|
||
|
||
|
||
def sha256_file(path: Path) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *path*'s bytes."""
    full_digest = hashlib.sha256(path.read_bytes()).hexdigest()
    return full_digest[:16]
|
||
|
||
|
||
def iso_now():
    """Return the current time in UTC as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
||
|
||
|
||
def load_manifest() -> dict:
    """Load ``manifest.json``, falling back to a fresh, empty manifest.

    Returns:
        The parsed manifest, guaranteed to be a dict whose ``"files"`` value
        is a dict. A new ``{"version": 1, "updated_at": ..., "files": {}}``
        is returned when the file is missing, unreadable, not valid JSON, or
        its top-level value is not a JSON object.
    """
    if MANIFEST_FILE.exists():
        try:
            data = json.loads(MANIFEST_FILE.read_text(encoding="utf-8"))
        except (ValueError, OSError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            # Warn instead of failing silently: starting from an empty manifest
            # makes every raw file look new on the next sync.
            print(
                f"warning: could not read {MANIFEST_FILE} ({exc}); "
                "starting from an empty manifest",
                file=sys.stderr,
            )
        else:
            if isinstance(data, dict):
                # Callers index manifest["files"] directly, so make sure it exists.
                if not isinstance(data.get("files"), dict):
                    data["files"] = {}
                return data
            print(
                f"warning: {MANIFEST_FILE} does not contain a JSON object; "
                "starting from an empty manifest",
                file=sys.stderr,
            )
    return {"version": 1, "updated_at": iso_now(), "files": {}}
|
||
|
||
|
||
def save_manifest(manifest: dict):
    """Stamp *manifest* with the current time and write it to ``manifest.json``.

    The JSON is written to a sibling ``.tmp`` file first and then moved into
    place with ``os.replace``, so an interrupted run cannot leave a truncated
    manifest behind.

    Args:
        manifest: The manifest dict; its ``"updated_at"`` key is overwritten
            in place before serialization.
    """
    manifest["updated_at"] = iso_now()
    payload = json.dumps(manifest, ensure_ascii=False, indent=2)

    MANIFEST_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_file = MANIFEST_FILE.with_name(MANIFEST_FILE.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    os.replace(tmp_file, MANIFEST_FILE)
|
||
|
||
|
||
def scan_raw() -> dict[str, dict]:
    """Scan ``raw/`` recursively for Markdown files.

    Returns:
        A mapping ``{relative_path: {"hash", "modified", "size", "abs_path"}}``
        where ``relative_path`` is relative to the repository root and always
        uses forward slashes, so it matches the ``raw/...`` keys written by
        the bootstrap step on every platform. Files whose name starts with
        "." are skipped. An empty dict is returned when ``raw/`` is missing.
    """
    raw_dir = REPO_ROOT / "raw"
    result = {}
    if not raw_dir.exists():
        return result

    # Sorted so files are reported and ingested in a stable order.
    for md_file in sorted(raw_dir.rglob("*.md")):
        if not md_file.is_file() or md_file.name.startswith("."):
            continue
        stat = md_file.stat()
        rel = md_file.relative_to(REPO_ROOT).as_posix()
        result[rel] = {
            "hash": sha256_file(md_file),
            "modified": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat(),
            "size": stat.st_size,
            "abs_path": str(md_file),
        }
    return result
|
||
|
||
|
||
def build_slug_from_path(rel_path: str) -> str:
    """Derive a wiki slug from the file name (without extension) of *rel_path*.

    Alphanumeric characters (including non-ASCII letters such as CJK) and
    "-", "_", "·" are kept; every other character becomes "-". Leading and
    trailing dashes are removed, and "untitled" is returned if nothing is left.
    """
    allowed_punctuation = ("-", "_", "·")
    stem = Path(rel_path).stem

    pieces = []
    for ch in stem:
        # Spaces and path separators are not alphanumeric, so they become "-".
        if ch.isalnum() or ch in allowed_punctuation:
            pieces.append(ch)
        else:
            pieces.append("-")

    slug = "".join(pieces).strip("-")
    if not slug:
        return "untitled"
    return slug
|
||
|
||
|
||
def call_ingest(source_path: str, slug: str = None) -> dict:
    """Run ``tools/ingest.py`` on *source_path* in a subprocess.

    Args:
        source_path: Path of the raw file; passed as the only CLI argument.
        slug: Accepted for caller compatibility but not forwarded to
            ingest.py (the command line built here has no slug argument).

    Returns:
        ``{"success": bool, "stdout": str, "stderr": str}``. ``success`` is
        true only when the process exits with code 0. On timeout or when the
        process cannot be started, ``stderr`` carries the reason.
    """
    cmd = [sys.executable, str(REPO_ROOT / "tools" / "ingest.py"), source_path]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Undecodable bytes in the child's output must not raise here and
            # turn a completed ingest into a reported failure.
            errors="replace",
            timeout=300,
            cwd=str(REPO_ROOT),
        )
    except subprocess.TimeoutExpired:
        return {"success": False, "stdout": "", "stderr": "Timeout (>5min)"}
    except Exception as exc:
        # Best-effort boundary: report any launch failure to the caller.
        return {"success": False, "stdout": "", "stderr": str(exc)}

    stderr = result.stderr
    if result.returncode != 0 and not stderr.strip():
        # Give callers something to log when the child failed silently.
        stderr = f"ingest.py exited with code {result.returncode}"
    return {
        "success": result.returncode == 0,
        "stdout": result.stdout,
        "stderr": stderr,
    }
|
||
|
||
|
||
def _wikilink_page_name(target: str) -> str:
    """Return the page name of a wikilink target: last path part, no ``.md``."""
    name = target.rsplit("/", 1)[-1]
    # Strip only a literal ".md"; Path.stem would also cut names such as
    # "GPT-4.5" down to "GPT-4".
    if name.endswith(".md"):
        name = name[:-3]
    return name


def find_orphan_entity_concept(manifest: dict) -> tuple[list, list]:
    """Find entity and concept pages that no source page links to.

    Every ``[[wikilink]]`` in ``wiki/sources/*.md`` is collected. Links that
    start with ``entities/`` or ``concepts/`` count for that kind only; bare
    links (no "/") count for both kinds, because they could refer to either.
    A trailing ``|alias`` or ``#heading`` is ignored when resolving the link.

    Args:
        manifest: Unused; kept so existing callers keep working.

    Returns:
        ``(orphan_entities, orphan_concepts)`` — file names (with ``.md``) in
        ``wiki/entities/`` and ``wiki/concepts/`` that are never referenced.
    """
    import re

    wikilink_pattern = re.compile(r"\[\[([^\]]+)\]\]")

    sources_dir = WIKI_DIR / "sources"
    referenced_entities = set()
    referenced_concepts = set()

    if sources_dir.exists():
        for src in sources_dir.glob("*.md"):
            content = src.read_text(encoding="utf-8")
            for link in wikilink_pattern.findall(content):
                # Drop the display alias and heading anchor; "\|" is how a
                # pipe is escaped inside Markdown tables.
                target = link.split("|", 1)[0].split("#", 1)[0]
                target = target.rstrip("\\").strip()
                if not target:
                    continue
                page = _wikilink_page_name(target)
                if target.startswith("entities/"):
                    referenced_entities.add(page)
                elif target.startswith("concepts/"):
                    referenced_concepts.add(page)
                elif "/" not in target:
                    # Bare wikilink: may point to an entity or a concept.
                    referenced_entities.add(page)
                    referenced_concepts.add(page)

    def unreferenced(directory, referenced):
        if not directory.exists():
            return []
        return [f.name for f in directory.glob("*.md") if f.stem not in referenced]

    orphan_entities = unreferenced(WIKI_DIR / "entities", referenced_entities)
    orphan_concepts = unreferenced(WIKI_DIR / "concepts", referenced_concepts)
    return orphan_entities, orphan_concepts
|
||
|
||
|
||
# ─── 核心同步逻辑 ───────────────────────────────────────────────
|
||
|
||
def check_changes(manifest: dict, raw_files: dict) -> dict:
    """Classify raw files against the manifest.

    Args:
        manifest: Loaded manifest; its "files" mapping is the recorded state.
        raw_files: Current scan result, ``{rel_path: info}`` where ``info``
            contains at least "hash".

    Returns:
        ``{"new": [...], "updated": [...], "deleted": [...], "unchanged": [...]}``.
        "new" and "updated" items are the scan info plus "rel_path" (and
        "old_hash" for updates); "deleted" items carry "rel_path", "slug" and
        "source_path"; "unchanged" is a list of relative paths.
    """
    known = manifest.get("files", {})
    new_items = []
    updated_items = []
    unchanged_paths = []

    # Walk the files currently present under raw/.
    for rel_path, info in raw_files.items():
        if rel_path not in known:
            item = {"rel_path": rel_path}
            item.update(info)
            new_items.append(item)
            continue

        recorded_hash = known[rel_path]["hash"]
        if info["hash"] == recorded_hash:
            unchanged_paths.append(rel_path)
        else:
            item = {"rel_path": rel_path, "old_hash": recorded_hash}
            item.update(info)
            updated_items.append(item)

    # Walk the manifest to find entries whose file no longer exists on disk.
    deleted_items = [
        {
            "rel_path": rel_path,
            "slug": entry.get("slug", build_slug_from_path(rel_path)),
            "source_path": entry.get("source_path"),
        }
        for rel_path, entry in known.items()
        if not (REPO_ROOT / rel_path).exists()
    ]

    return {
        "new": new_items,
        "updated": updated_items,
        "deleted": deleted_items,
        "unchanged": unchanged_paths,
    }
|
||
|
||
|
||
def run_sync(dry_run: bool = False, verbose: bool = False):
    """Compare raw/ with the manifest and, unless *dry_run*, apply the changes.

    Steps: load the manifest, scan raw/, classify files as new / updated /
    deleted and print a report. In live mode, new and updated files are passed
    to ingest.py, deleted files are dropped from the manifest (their wiki
    pages are kept), the manifest is saved and an orphan entity/concept report
    is printed. Nothing further happens when no changes are detected.

    Args:
        dry_run: When true, only report the changes and return.
        verbose: When true, list the individual files in dry-run mode as well
            (live mode always lists them).
    """
    print(f"\n{bold('=== Wiki Sync')}\n")
    print(f" Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print(f" Raw: {REPO_ROOT / 'raw'}")
    print(f" Wiki: {WIKI_DIR}")
    print(f" Mode: {'DRY-RUN (preview only)' if dry_run else 'LIVE SYNC'}")
    print()

    # Step 1: load manifest
    manifest = load_manifest()
    log("manifest.json loaded", "info")

    # Step 2: scan raw/
    raw_files = scan_raw()
    log(f"raw/ scan: {len(raw_files)} .md files found", "info")

    # Step 3: check changes
    changes = check_changes(manifest, raw_files)
    total_changes = len(changes["new"]) + len(changes["updated"]) + len(changes["deleted"])

    if total_changes == 0:
        log("No changes detected — wiki is up to date.", "success")
        return

    # ─── Report ───
    print(f"\n{bold('--- Changes ---')}")
    print(f" {green('+')} New: {len(changes['new'])}")
    print(f" {yellow('~')} Updated: {len(changes['updated'])}")
    print(f" {red('-')} Deleted: {len(changes['deleted'])}")

    if verbose or not dry_run:
        if changes["new"]:
            print(f"\n {bold('New Files:')}")
            for f in changes["new"]:
                log(f"{green('[+]')} {f['rel_path']}", "normal")

        if changes["updated"]:
            print(f"\n {bold('Updated Files:')}")
            for f in changes["updated"]:
                log(f"{yellow('[~]')} {f['rel_path']} (hash changed)", "normal")

        if changes["deleted"]:
            print(f"\n {bold('Deleted Files:')}")
            for f in changes["deleted"]:
                log(f"{red('[-]')} {f['rel_path']}", "normal")

    if dry_run:
        log("\nDry-run complete. Run with --sync to apply.", "warn")
        return

    # ─── Apply Sync ───
    print(f"\n{bold('--- Applying Sync ---')}")

    # Work on a copy so the loaded manifest stays available for lookups.
    known_files = manifest.get("files", {})
    updated_manifest = manifest.copy()
    updated_manifest["files"] = known_files.copy()

    # 1) New files -> ingest
    for f in changes["new"]:
        rel_path = f["rel_path"]
        slug = build_slug_from_path(rel_path)
        print(f"\n {green('[+]')} New: {rel_path}")
        print(f" slug: {slug}")

        result = call_ingest(f["abs_path"], slug)
        entry = {
            "hash": f["hash"],
            "modified": f["modified"],
            "slug": slug,
            "source_path": f"wiki/sources/{slug}.md",
        }
        if result["success"]:
            log(f"Ingested: {slug}.md", "success")
            entry["ingested"] = True
            entry["ingested_at"] = iso_now()
        else:
            log(f"Failed: {result['stderr'][:200]}", "error")
            # Recorded anyway so the same content is not ingested again on
            # every run; the entry is retried once the file's hash changes.
            entry["ingested"] = False
            entry["ingested_at"] = None
            entry["error"] = result["stderr"][:500]
        updated_manifest["files"][rel_path] = entry

    # 2) Modified files -> re-ingest
    for f in changes["updated"]:
        rel_path = f["rel_path"]
        old_slug = known_files.get(rel_path, {}).get("slug") or build_slug_from_path(rel_path)
        print(f"\n {yellow('[~]')} Updated: {rel_path}")

        result = call_ingest(f["abs_path"], old_slug)
        if result["success"]:
            log(f"Re-ingested: {old_slug}.md", "success")
            entry = {
                **updated_manifest["files"].get(rel_path, {}),
                "hash": f["hash"],
                "modified": f["modified"],
                "slug": old_slug,
                "source_path": f"wiki/sources/{old_slug}.md",
                "ingested": True,
                "ingested_at": iso_now(),
            }
            # A previous failed attempt may have left an error message behind.
            entry.pop("error", None)
            updated_manifest["files"][rel_path] = entry
        else:
            # The manifest entry is left untouched, so the next run retries.
            log(f"Failed: {result['stderr'][:200]}", "error")

    # 3) Deleted files -> keep the wiki content, only drop the manifest entry
    for f in changes["deleted"]:
        rel_path = f["rel_path"]
        source_path = f.get("source_path")
        print(f"\n {red('[-]')} Deleted: {rel_path}")
        if source_path:
            # source_path is stored relative to the repository root
            # ("wiki/sources/<slug>.md"), not relative to wiki/.
            sp = REPO_ROOT / source_path
            log(f" Wiki source kept: {sp}", "warn")
        updated_manifest["files"].pop(rel_path, None)

    # Step 4: Save manifest
    save_manifest(updated_manifest)
    log(f"\nmanifest.json updated ({len(updated_manifest['files'])} entries)", "success")

    # Step 5: Orphan detection
    orphan_entities, orphan_concepts = find_orphan_entity_concept(updated_manifest)
    if orphan_entities or orphan_concepts:
        print(f"\n{bold('--- Orphan Report (kept as requested) ---')}")
        if orphan_entities:
            print(f" {bold('Orphan Entities')} ({len(orphan_entities)}):")
            for e in sorted(orphan_entities):
                print(f" {dim('?')} {e}")
        if orphan_concepts:
            print(f" {bold('Orphan Concepts')} ({len(orphan_concepts)}):")
            for c in sorted(orphan_concepts):
                print(f" {dim('?')} {c}")
        log("\nOrphan pages are kept (not deleted per user request).", "info")
    else:
        log("No orphan entity/concept detected.", "success")

    print(f"\n{bold('Done.')}")
|
||
|
||
|
||
def run_bootstrap():
    """Build a fresh manifest.json from the pages already in wiki/sources/.

    Each source page is searched for its first reference to a ``raw/....md``
    path. If that raw file still exists, a manifest entry is created for it
    with the page's file name as slug and ``ingested`` set to true, so a later
    sync does not ingest it again. Pages without such a reference, or whose
    raw file is gone, are counted and skipped. Any existing manifest is
    overwritten.
    """
    import re

    print(f"\n{bold('=== Wiki Bootstrap')}\n")
    print(" Scanning existing wiki sources to build manifest ...\n")

    sources_dir = WIKI_DIR / "sources"
    if not sources_dir.exists():
        print(f" {red('✗')} No wiki/sources/ directory found. Nothing to bootstrap.")
        return

    wikilink_pattern = re.compile(r"\[\[?raw/([^\]\s]+\.md)\]?]?", re.IGNORECASE)

    manifest = {"version": 1, "updated_at": iso_now(), "files": {}}
    raw_dir = (REPO_ROOT / "raw").resolve()  # follow a symlinked raw/ to its real path
    bootstrapped = 0
    skipped_not_found = 0
    skipped_no_source_field = 0

    for src in sources_dir.glob("*.md"):
        content = src.read_text(encoding="utf-8")

        # The first raw/...md reference anywhere in the page is taken as the
        # original file of this source page.
        match = wikilink_pattern.search(content)
        if not match:
            skipped_no_source_field += 1
            continue

        # raw_rel looks like "Agent/usecases/xxx.md" (without the raw/ prefix).
        raw_rel = match.group(1).lstrip("/")
        raw_path = raw_dir / raw_rel

        # is_file() rather than exists(): a directory that happens to be
        # named "*.md" cannot be hashed.
        if not raw_path.is_file():
            # Raw file is gone: keep the source page but leave it out of the manifest.
            skipped_not_found += 1
            continue

        modified_iso = datetime.fromtimestamp(
            raw_path.stat().st_mtime, tz=timezone.utc
        ).isoformat()
        slug = src.stem

        # Manifest keys are repo-root-relative: "raw/Agent/xxx.md".
        manifest_key = f"raw/{raw_rel}"
        manifest["files"][manifest_key] = {
            "hash": sha256_file(raw_path),
            "modified": modified_iso,
            "slug": slug,
            "source_path": f"wiki/sources/{slug}.md",
            "ingested": True,
            # The real ingest time is unknown here; the file's mtime is used.
            "ingested_at": modified_iso,
        }
        bootstrapped += 1

    save_manifest(manifest)

    print(f" {bold('Result:')}")
    print(f" {green('✓')} Manifest entries created: {bootstrapped}")
    print(f" {yellow('~')} Skipped (source file deleted): {skipped_not_found}")
    print(f" {dim('-')} Skipped (no source_file field): {skipped_no_source_field}")
    print(f"\n {green('✓')} manifest.json created at: {MANIFEST_FILE}")
    print(f"\n Run now: {bold('python tools/sync.py --check')} to preview new/updated files.\n")
|
||
|
||
|
||
def run_check():
    """Print a preview of new / updated / deleted raw files; change nothing."""
    manifest = load_manifest()
    raw_files = scan_raw()
    changes = check_changes(manifest, raw_files)

    new_items = changes["new"]
    updated_items = changes["updated"]
    deleted_items = changes["deleted"]
    manifest_count = len(manifest.get('files', {}))

    print(f"\n{bold('=== Wiki Sync Check')} (preview mode)\n")
    print(f" Raw files: {len(raw_files)}")
    print(f" Manifest entries: {manifest_count}")
    print(f" {green('+')} New: {len(new_items)}")
    print(f" {yellow('~')} Updated: {len(updated_items)}")
    print(f" {red('-')} Deleted: {len(deleted_items)}")

    if not (new_items or updated_items or deleted_items):
        print(f"\n {green('No changes — wiki is in sync.')}")
    else:
        if new_items:
            print(f"\n {bold('New Files:')}")
            for item in new_items:
                print(f" {green('[+]')} {item['rel_path']}")
        if updated_items:
            print(f"\n {bold('Updated Files:')}")
            for item in updated_items:
                print(f" {yellow('[~]')} {item['rel_path']} (was {item['old_hash']}, now {item['hash']})")
        if deleted_items:
            print(f"\n {bold('Deleted Files:')}")
            for item in deleted_items:
                print(f" {red('[-]')} {item['rel_path']}")

    print()
|
||
|
||
|
||
def run_rebuild():
    """Rebuild ``wiki/index.md`` from the manifest (fallback path).

    The Sources section lists one link per manifest entry, newest ``modified``
    first. The link label is the first line of the source page with leading
    "#" and spaces removed, or the slug when that line is empty. Entries whose
    source page is missing are marked "(source missing)".

    NOTE: the Entities, Concepts and Syntheses sections are written as empty
    headings, so whatever an existing index.md listed there is overwritten.
    """
    manifest = load_manifest()
    print(f"\n{bold('=== Wiki Rebuild from Manifest')}\n")
    print(f" Manifest entries: {len(manifest.get('files', {}))}")
    print(" Rebuilding index.md ...\n")

    index_lines = [
        "# Wiki Index\n",
        "\n## Overview\n",
        "- [Overview](overview.md) — living synthesis\n",
        "\n## Sources\n",
    ]

    files = manifest.get("files", {})
    # Newest first. "or ''" keeps the sort working when "modified" is null.
    sorted_files = sorted(
        files.items(),
        key=lambda item: item[1].get("modified") or "",
        reverse=True,
    )

    for rel_path, info in sorted_files:
        slug = info.get("slug", build_slug_from_path(rel_path))
        source_md_path = WIKI_DIR / "sources" / f"{slug}.md"
        if source_md_path.exists():
            first_line = source_md_path.read_text(encoding="utf-8").split("\n")[0]
            # Fall back to the slug so an empty or blank first line does not
            # produce a link with an empty label.
            title = first_line.lstrip("# ").strip() or slug
            index_lines.append(f"- [{title}](sources/{slug}.md)\n")
        else:
            index_lines.append(f"- [{slug}](sources/{slug}.md) — (source missing)\n")

    index_lines.append("\n## Entities\n\n## Concepts\n\n## Syntheses\n")

    index_file = WIKI_DIR / "index.md"
    index_file.write_text("".join(index_lines), encoding="utf-8")
    print(f" {green('✓')} index.md rebuilt with {len(sorted_files)} sources")

    # Orphan report
    orphan_entities, orphan_concepts = find_orphan_entity_concept(manifest)
    if orphan_entities:
        print(f" {dim('?')} Orphan entities: {len(orphan_entities)}")
    if orphan_concepts:
        print(f" {dim('?')} Orphan concepts: {len(orphan_concepts)}")

    print("\nDone.")
|
||
|
||
|
||
# ─── CLI 入口 ───────────────────────────────────────────────
|
||
|
||
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(
        description="Wiki ↔ Raw 三向同步工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (option strings, help text) for every boolean switch, in help order.
    switch_specs = (
        (("--check",), "预览变化,不执行同步"),
        (("--sync",), "执行完整同步(新增/修改/删除 + orphan 检测)"),
        (("--rebuild",), "从 manifest 重建 wiki/index.md(兜底方案)"),
        (("--bootstrap",), "从现有 wiki sources 反向生成 manifest(首次使用,跳过已 ingest 的文件)"),
        (("--verbose", "-v"), "详细输出"),
    )
    for option_strings, help_text in switch_specs:
        cli.add_argument(*option_strings, action="store_true", help=help_text)

    opts = cli.parse_args()

    # When several modes are given, the first one in this order wins.
    mode_handlers = (
        ("bootstrap", run_bootstrap),
        ("rebuild", run_rebuild),
        ("check", run_check),
        ("sync", lambda: run_sync(dry_run=False, verbose=opts.verbose)),
    )
    for mode_name, handler in mode_handlers:
        if getattr(opts, mode_name):
            handler()
            break
    else:
        # No mode selected: show the help text followed by usage examples.
        cli.print_help()
        print("\n示例:")
        print(" python tools/sync.py --check # 预览变化")
        print(" python tools/sync.py --sync # 执行同步")
        print(" python tools/sync.py --sync -v # 详细模式")
        print(" python tools/sync.py --rebuild # 重建 index")
        print(" python tools/sync.py --bootstrap # 首次:从 wiki sources 生成 manifest")
|