Files
llm-wiki-agent/tools/sync.py
weishen a31d28a386 feat(wiki): ingest Docker image transfer guide
- Create source page with complete documentation
- Add Docker-Image, Docker-Save, Docker-Load concept pages
- Update Docker entity with new source reference
- Update log.md entry
2026-04-21 14:19:46 +08:00

577 lines
20 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
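Wiki ↔ Raw three-way sync tool.

Features:
- Detect file changes under raw/ (added / modified / deleted)
- Maintain the manifest.json state map
- Detect orphan entity/concept pages (report only, never delete)

Usage:
    python tools/sync.py --check      Preview changes (nothing is written)
    python tools/sync.py --sync       Run the sync (updates the manifest)
    python tools/sync.py --pending    List the files waiting to be processed
    python tools/sync.py --json       JSON-lines output (for programs to consume)
    python tools/sync.py --rebuild    Rebuild wiki/index from the manifest (fallback)

manifest.json format: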
{
"version": 1,
"updated_at": "ISO timestamp",
"files": {
"relative/path/to/file.md": {
"hash": "sha256",
"modified": "ISO timestamp",
"slug": "wiki-source-slug",
"source_path": "wiki/sources/slug.md",
"ingested": true
}
}
}
"""
import json
import hashlib
import argparse
from pathlib import Path
from datetime import datetime, timezone
# Repository root: the parent of the directory that contains this script.
REPO_ROOT = Path(__file__).parent.parent.resolve()
# Wiki directory (<repo>/wiki); index.md and the sources/, entities/ and
# concepts/ subdirectories are looked up beneath it.
WIKI_DIR = REPO_ROOT / "wiki"
# Sync state file, stored next to this script.
MANIFEST_FILE = Path(__file__).parent / "manifest.json"
# ─── 工具函数 ───────────────────────────────────────────────
def green(text):
    """Wrap *text* in the ANSI bright-green colour code (92) plus a reset."""
    return "\033[92m{}\033[0m".format(text)
def yellow(text):
    """Wrap *text* in the ANSI bright-yellow colour code (93) plus a reset."""
    return "\033[93m{}\033[0m".format(text)
def red(text):
    """Wrap *text* in the ANSI bright-red colour code (91) plus a reset."""
    return "\033[91m{}\033[0m".format(text)
def dim(text):
    """Wrap *text* in the ANSI faint/dim attribute (2) plus a reset."""
    return "\033[2m{}\033[0m".format(text)
def bold(text):
    """Wrap *text* in the ANSI bold attribute (1) plus a reset."""
    return "\033[1m{}\033[0m".format(text)
def log(msg, style="normal"):
    """Print *msg* to stdout behind the prefix registered for *style*.

    Recognised styles are "normal", "info", "success", "warn", "error" and
    "section"; any other style falls back to a single-space prefix.
    """
    prefix_by_style = {
        "normal": " ",
        "info": " ",
        "success": "",
        "warn": "",
        "error": "",
        "section": "\n── ",
    }
    if style in prefix_by_style:
        lead = prefix_by_style[style]
    else:
        lead = " "
    print("{}{}".format(lead, msg))
def sha256_file(path: Path) -> str:
    """Return the first 16 hex digits of the SHA-256 digest of the file at *path*.

    The whole file is read into memory in one call.
    """
    digest = hashlib.sha256(path.read_bytes()).hexdigest()
    return digest[:16]
def iso_now():
    """Return the current time in UTC as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
def load_manifest() -> dict:
    """Load the sync manifest, falling back to an empty one.

    Returns:
        The parsed manifest dict. The result always has a dict under
        "files", so callers can index ``manifest["files"]`` safely. When the
        manifest file is missing, unreadable, not valid JSON, or not a JSON
        object, a fresh ``{"version": 1, "updated_at": ..., "files": {}}``
        is returned instead.
    """
    import sys

    fresh = {"version": 1, "updated_at": iso_now(), "files": {}}
    if not MANIFEST_FILE.exists():
        return fresh

    try:
        manifest = json.loads(MANIFEST_FILE.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
        # Still best-effort, but say so: the next save will overwrite the
        # broken file. stderr keeps --json output on stdout parseable.
        print(
            f"warning: could not read {MANIFEST_FILE} ({exc}); "
            "starting from an empty manifest",
            file=sys.stderr,
        )
        return fresh

    if not isinstance(manifest, dict):
        print(
            f"warning: {MANIFEST_FILE} is not a JSON object; "
            "starting from an empty manifest",
            file=sys.stderr,
        )
        return fresh

    # Callers index manifest["files"] directly; guarantee it is a dict.
    if not isinstance(manifest.get("files"), dict):
        manifest["files"] = {}
    return manifest
def save_manifest(manifest: dict):
    """Stamp *manifest* with the current time and write it to MANIFEST_FILE.

    Args:
        manifest: Manifest dict; its "updated_at" key is overwritten in place.

    The JSON is written to a sibling ``<name>.tmp`` file and then renamed
    over the target, so an interrupted write cannot leave a truncated
    manifest behind.
    """
    manifest["updated_at"] = iso_now()
    payload = json.dumps(manifest, ensure_ascii=False, indent=2)
    tmp_file = MANIFEST_FILE.with_name(MANIFEST_FILE.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    # Path.replace overwrites an existing target unconditionally.
    tmp_file.replace(MANIFEST_FILE)
def scan_raw() -> dict[str, dict]:
    """Index every ``*.md`` file under ``raw/`` whose name does not start with a dot.

    Returns:
        ``{relative_path: {"hash", "modified", "size", "abs_path"}}`` where
        the key is the path relative to the repository root, "hash" comes
        from sha256_file, "modified" is the mtime as a UTC ISO string and
        "size" is the byte size. Empty when ``raw/`` does not exist.
    """
    raw_root = REPO_ROOT / "raw"
    if not raw_root.exists():
        return {}

    found = {}
    for md_path in raw_root.rglob("*.md"):
        if not md_path.is_file() or md_path.name.startswith("."):
            continue
        key = str(md_path.relative_to(REPO_ROOT))
        file_stat = md_path.stat()
        mtime_utc = datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc)
        found[key] = {
            "hash": sha256_file(md_path),
            "modified": mtime_utc.isoformat(),
            "size": file_stat.st_size,
            "abs_path": str(md_path),
        }
    return found
def build_slug_from_path(rel_path: str) -> str:
    """Derive a wiki slug from the file stem of *rel_path*.

    Alphanumeric characters (including CJK) and ``-``, ``_``, ``·`` are kept;
    every other character, spaces and path separators included, becomes
    ``-``. Leading and trailing dashes are trimmed. Case is preserved.
    Returns "untitled" when nothing is left.
    """
    keep = {"-", "_", "·"}
    pieces = []
    for ch in Path(rel_path).stem:
        if ch.isalnum() or ch in keep:
            pieces.append(ch)
        else:
            pieces.append("-")
    slug = "".join(pieces).strip("-")
    if not slug:
        return "untitled"
    return slug
def find_orphan_entity_concept(manifest: dict) -> tuple[list, list]:
    """Report entity and concept pages that no source page links to.

    Every ``wiki/sources/*.md`` file is scanned for ``[[wikilink]]`` targets.
    A target starting with ``entities/`` or ``concepts/`` counts as a
    reference to that page; a bare target (no ``/``) counts for both kinds.
    Any ``|alias`` or ``#heading`` suffix inside the link is ignored, and
    only a trailing ``.md`` is stripped from the page name, so dotted names
    such as ``Node.js`` still match their ``Node.js.md`` file.

    Args:
        manifest: Accepted for call-site compatibility; not consulted.

    Returns:
        ``(orphan_entities, orphan_concepts)`` — file names (with ``.md``)
        found in ``wiki/entities`` / ``wiki/concepts`` that were never
        referenced. Nothing is deleted.
    """
    import re

    wikilink_pattern = re.compile(r"\[\[([^\]]+)\]\]")

    def link_target(link: str) -> str:
        """Strip the display alias and heading anchor from a wikilink body."""
        return link.split("|", 1)[0].split("#", 1)[0].strip()

    def page_name(target: str) -> str:
        """Return the last path component of *target* without a ``.md`` suffix."""
        return target.rsplit("/", 1)[-1].removesuffix(".md")

    referenced_entities = set()
    referenced_concepts = set()
    sources_dir = WIKI_DIR / "sources"
    if sources_dir.exists():
        for src in sources_dir.glob("*.md"):
            content = src.read_text(encoding="utf-8")
            for link in wikilink_pattern.findall(content):
                target = link_target(link)
                if not target:
                    # e.g. [[#heading]] points inside the same page.
                    continue
                if target.startswith("entities/"):
                    referenced_entities.add(page_name(target))
                elif target.startswith("concepts/"):
                    referenced_concepts.add(page_name(target))
                elif "/" not in target:
                    # A bare link does not say which kind it means.
                    name = page_name(target)
                    referenced_entities.add(name)
                    referenced_concepts.add(name)

    def unreferenced(directory, referenced) -> list:
        """List ``*.md`` file names in *directory* whose stem is not referenced."""
        if not directory.exists():
            return []
        return [f.name for f in directory.glob("*.md") if f.stem not in referenced]

    orphan_entities = unreferenced(WIKI_DIR / "entities", referenced_entities)
    orphan_concepts = unreferenced(WIKI_DIR / "concepts", referenced_concepts)
    return orphan_entities, orphan_concepts
# ─── 核心同步逻辑 ───────────────────────────────────────────────
def check_changes(manifest: dict, raw_files: dict) -> dict:
    """Diff the manifest against the scanned raw files.

    Args:
        manifest: Manifest dict; its "files" mapping is the recorded state.
        raw_files: Mapping of relative path to file info, each with a "hash".

    Returns:
        A dict with four lists:
        "new" — scanned files absent from the manifest (info plus "rel_path");
        "updated" — files whose hash differs (info plus "rel_path", "old_hash");
        "deleted" — manifest entries whose path no longer exists under the
        repository root ("rel_path", "slug", "source_path");
        "unchanged" — relative paths whose hash matches.
    """
    known = manifest.get("files", {})
    new_items = []
    updated_items = []
    unchanged_paths = []

    for rel_path, info in raw_files.items():
        if rel_path not in known:
            new_items.append({"rel_path": rel_path, **info})
            continue
        if info["hash"] == known[rel_path]["hash"]:
            unchanged_paths.append(rel_path)
        else:
            updated_items.append({
                "rel_path": rel_path,
                "old_hash": known[rel_path]["hash"],
                **info,
            })

    deleted_items = []
    for rel_path, entry in known.items():
        if (REPO_ROOT / rel_path).exists():
            continue
        deleted_items.append({
            "rel_path": rel_path,
            "slug": entry.get("slug", build_slug_from_path(rel_path)),
            "source_path": entry.get("source_path"),
        })

    return {
        "new": new_items,
        "updated": updated_items,
        "deleted": deleted_items,
        "unchanged": unchanged_paths,
    }
def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = False):
    """Detect changes under ``raw/`` and, unless *dry_run*, record them in the manifest.

    New and updated files are stored with ``ingested: False``. Entries whose
    raw file disappeared are removed from the manifest. This function does
    not modify any wiki page.

    Args:
        dry_run: Report only; the manifest is not written.
        verbose: List the individual files even during a dry run.
        json_mode: Emit one JSON object per line on stdout and nothing else,
            so the stream can be parsed line by line. Events are
            ``new_detected`` / ``updated_detected`` / ``deleted_detected``,
            ``pending``, and a final ``sync_complete`` (also emitted, with
            zero counts, when a live sync finds no changes).
    """
    human = not json_mode  # human-readable text is suppressed in JSON mode

    def emit(event: dict) -> None:
        """Print one JSON-lines event."""
        print(json.dumps(event))

    if human:
        print(f"\n{bold('=== Wiki Sync')}\n", end="")
        print(f" Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
        print(f" Raw: {REPO_ROOT / 'raw'}\n")
        print(f" Wiki: {WIKI_DIR}\n")
        print(f" Mode: {'DRY-RUN (preview only)' if dry_run else 'LIVE SYNC'}")

    manifest = load_manifest()
    if human:
        log("manifest.json loaded", "info")
    raw_files = scan_raw()
    if human:
        log(f"raw/ scan: {len(raw_files)} .md files found", "info")

    changes = check_changes(manifest, raw_files)
    known_files = manifest.get("files", {})
    total_changes = len(changes["new"]) + len(changes["updated"]) + len(changes["deleted"])

    if total_changes == 0:
        if human:
            log("No changes detected — wiki is up to date.", "success")
        elif not dry_run:
            # Give JSON consumers a terminal event on the no-op path too.
            emit({
                "event": "sync_complete",
                "summary": {
                    "pending": 0,
                    "deleted": 0,
                    "manifest_entries": len(known_files),
                },
                "pending_files": [],
                "deleted_files": [],
            })
        return

    if human:
        print(f"\n{bold('--- Changes ---')}")
        print(f" {green('+')} New: {len(changes['new'])}")
        print(f" {yellow('~')} Updated: {len(changes['updated'])}")
        print(f" {red('-')} Deleted: {len(changes['deleted'])}")

    if verbose or not dry_run:
        if changes["new"]:
            if human:
                print(f"\n {bold('New Files:')}")
            for f in changes["new"]:
                slug = build_slug_from_path(f["rel_path"])
                if json_mode:
                    emit({"event": "new_detected", "rel_path": f["rel_path"], "slug": slug})
                else:
                    log(f"{green('[+]')} {f['rel_path']}", "normal")
        if changes["updated"]:
            if human:
                print(f"\n {bold('Updated Files:')}")
            for f in changes["updated"]:
                slug = known_files.get(f["rel_path"], {}).get("slug") or build_slug_from_path(f["rel_path"])
                if json_mode:
                    emit({"event": "updated_detected", "rel_path": f["rel_path"], "slug": slug})
                else:
                    log(f"{yellow('[~]')} {f['rel_path']} (hash changed)", "normal")
        if changes["deleted"]:
            if human:
                print(f"\n {bold('Deleted Files:')}")
            for f in changes["deleted"]:
                if json_mode:
                    emit({"event": "deleted_detected", "rel_path": f["rel_path"]})
                else:
                    log(f"{red('[-]')} {f['rel_path']}", "normal")

    if dry_run:
        if human:
            log("\nDry-run complete. Run with --sync to apply.", "warn")
        return

    # ─── Apply Sync ───
    if human:
        print(f"\n{bold('--- Applying Sync ---')}")

    # Work on a copy so the loaded manifest stays untouched until saved.
    updated_manifest = manifest.copy()
    updated_manifest["files"] = known_files.copy()
    pending_files = []

    # 1) New files: add to the manifest as not-yet-ingested.
    for f in changes["new"]:
        rel_path = f["rel_path"]
        slug = build_slug_from_path(rel_path)
        if json_mode:
            emit({"event": "pending", "rel_path": rel_path, "slug": slug, "action": "new"})
        pending_files.append({"rel_path": rel_path, "abs_path": f["abs_path"], "slug": slug, "action": "new"})
        updated_manifest["files"][rel_path] = {
            "hash": f["hash"],
            "modified": f.get("modified"),
            "slug": slug,
            "source_path": f"wiki/sources/{slug}.md",
            "ingested": False,
            "ingested_at": None,
        }

    # 2) Modified files: keep the old entry, refresh the hash, mark for re-ingest.
    for f in changes["updated"]:
        rel_path = f["rel_path"]
        old_entry = known_files.get(rel_path, {})
        slug = old_entry.get("slug") or build_slug_from_path(rel_path)
        if json_mode:
            emit({"event": "pending", "rel_path": rel_path, "slug": slug, "action": "updated"})
        pending_files.append({"rel_path": rel_path, "abs_path": f["abs_path"], "slug": slug, "action": "updated"})
        updated_manifest["files"][rel_path] = {
            **old_entry,
            "hash": f["hash"],
            "modified": f.get("modified"),
            "ingested": False,
            "ingested_at": None,
        }

    # 3) Deleted files: keep the wiki content, drop the manifest entry.
    deleted_files = []
    for f in changes["deleted"]:
        rel_path = f["rel_path"]
        source_path = f.get("source_path")
        if human:
            log(f"Deleted: {rel_path}", "warn")
            if source_path:
                # source_path is stored repo-relative ("wiki/sources/<slug>.md"),
                # so it is resolved against REPO_ROOT, not WIKI_DIR.
                sp = REPO_ROOT / source_path
                log(f" Wiki source kept: {sp}", "warn")
        if rel_path in updated_manifest["files"]:
            del updated_manifest["files"][rel_path]
        deleted_files.append(rel_path)

    save_manifest(updated_manifest)

    if json_mode:
        emit({
            "event": "sync_complete",
            "summary": {
                "pending": len(pending_files),
                "deleted": len(deleted_files),
                "manifest_entries": len(updated_manifest["files"]),
            },
            "pending_files": pending_files,
            "deleted_files": deleted_files,
        })
        return

    log(f"\nmanifest.json updated ({len(updated_manifest['files'])} entries)", "success")

    # Orphan report (human output only); nothing is deleted.
    orphan_entities, orphan_concepts = find_orphan_entity_concept(updated_manifest)
    if orphan_entities or orphan_concepts:
        print(f"\n{bold('--- Orphan Report (kept as requested) ---')}")
        if orphan_entities:
            print(f" {bold('Orphan Entities')} ({len(orphan_entities)}):")
            for e in sorted(orphan_entities):
                print(f" {dim('?')} {e}")
        if orphan_concepts:
            print(f" {bold('Orphan Concepts')} ({len(orphan_concepts)}):")
            for c in sorted(orphan_concepts):
                print(f" {dim('?')} {c}")
        log("\nOrphan pages are kept (not deleted per user request).", "info")
    else:
        log("No orphan entity/concept detected.", "success")

    print(f"\n{bold('Done.')}")
    print(f"\n Pending files for ingestion: {len(pending_files)}")
def run_check():
    """Print a Markdown preview of pending changes without writing anything.

    The report has a summary list (raw file count, manifest entry count and
    the new/updated/deleted counts) followed by one section per non-empty
    change kind, or a single "in sync" line when nothing changed.
    """
    manifest = load_manifest()
    raw_files = scan_raw()
    changes = check_changes(manifest, raw_files)

    new_items = changes["new"]
    updated_items = changes["updated"]
    deleted_items = changes["deleted"]

    # Markdown header and summary
    print("# Wiki Sync Check\n")
    print(f"- Raw files: {len(raw_files)}")
    print(f"- Manifest entries: {len(manifest.get('files', {}))}")
    print(f"- New: {len(new_items)}")
    print(f"- Updated: {len(updated_items)}")
    print(f"- Deleted: {len(deleted_items)}\n")

    if len(new_items) + len(updated_items) + len(deleted_items) <= 0:
        print("No changes — wiki is in sync.\n")
        return

    sections = (
        ("## New Files", new_items,
         lambda e: f"- {e['rel_path']}"),
        ("## Updated Files", updated_items,
         lambda e: f"- {e['rel_path']} (was {e['old_hash']}, now {e['hash']})"),
        ("## Deleted Files", deleted_items,
         lambda e: f"- {e['rel_path']}"),
    )
    for heading, entries, render in sections:
        if not entries:
            continue
        print(heading)
        for entry in entries:
            print(render(entry))
        print()
def run_rebuild():
    """Regenerate ``wiki/index.md`` from the manifest (fallback path).

    The index lists one link per manifest entry, newest "modified" first.
    The link text is the first line of the source page with leading ``#``
    and spaces removed. When that line is blank or made only of dashes
    (presumably a front-matter fence — not confirmed from this file), the
    slug is used instead. Entries without a source page are listed with a
    "(source missing)" marker. The Entities, Concepts and Syntheses sections
    are written as empty headings. Orphan counts are printed afterwards.
    """
    manifest = load_manifest()
    files = manifest.get("files", {})
    print(f"\n{bold('=== Wiki Rebuild from Manifest')}\n")
    print(f" Manifest entries: {len(files)}")
    print(" Rebuilding index.md ...\n")

    index_lines = [
        "# Wiki Index\n",
        "\n## Overview\n",
        "- [Overview](overview.md) — living synthesis\n",
        "\n## Sources\n",
    ]

    # "modified" may be stored as None; `or ""` keeps every sort key a string.
    sorted_files = sorted(
        files.items(),
        key=lambda item: item[1].get("modified") or "",
        reverse=True,
    )
    for rel_path, info in sorted_files:
        slug = info.get("slug") or build_slug_from_path(rel_path)
        source_md_path = WIKI_DIR / "sources" / f"{slug}.md"
        if source_md_path.exists():
            first_line = source_md_path.read_text(encoding="utf-8").split("\n")[0]
            title = first_line.lstrip("# ").strip()
            if not title.strip("-"):
                # Empty page or a bare "---" line: no usable link text.
                title = slug
            index_lines.append(f"- [{title}](sources/{slug}.md)\n")
        else:
            index_lines.append(f"- [{slug}](sources/{slug}.md) — (source missing)\n")

    index_lines.append("\n## Entities\n\n## Concepts\n\n## Syntheses\n")

    index_file = WIKI_DIR / "index.md"
    # The wiki directory may not exist yet when rebuilding from scratch.
    index_file.parent.mkdir(parents=True, exist_ok=True)
    index_file.write_text("".join(index_lines), encoding="utf-8")
    print(f" {green('')} index.md rebuilt with {len(sorted_files)} sources")

    orphan_entities, orphan_concepts = find_orphan_entity_concept(manifest)
    if orphan_entities:
        print(f" {dim('?')} Orphan entities: {len(orphan_entities)}")
    if orphan_concepts:
        print(f" {dim('?')} Orphan concepts: {len(orphan_concepts)}")
    print("\nDone.")
# ─── CLI 入口 ───────────────────────────────────────────────
def _pending_record(rel_path: str, info: dict) -> dict:
    """Build the JSON record describing one pending manifest entry."""
    return {
        "rel_path": rel_path,
        "slug": info.get("slug", build_slug_from_path(rel_path)),
        "source_path": info.get("source_path"),
        "modified": info.get("modified"),
        "hash": info.get("hash"),
    }


def _pending_payload(pending: list, limit) -> dict:
    """Build the ``pending_list`` JSON payload for ``--pending --json``.

    ``limit`` of None returns every entry under "files"; a limit <= 0 returns
    an empty "files" list; a limit of exactly 1 returns a single record (or
    None) under "file"; any larger limit returns the first N under "files".
    "count" is the total number of pending entries in every case.
    """
    total = len(pending)
    if limit == 1:
        if not pending:
            return {"event": "pending_list", "count": 0, "file": None}
        rel_path, info = pending[0]
        return {
            "event": "pending_list",
            "count": total,
            "file": _pending_record(rel_path, info),
        }
    shown = pending if limit is None else pending[:max(0, limit)]
    return {
        "event": "pending_list",
        "count": total,
        "files": [_pending_record(rel_path, info) for rel_path, info in shown],
    }


def main():
    """Parse the command line and dispatch to the selected action.

    When several action flags are given, the first match in this order wins:
    --rebuild, --pending, --reset-failed, --check, --sync. With no action
    flag the help text and a few examples are printed.
    """
    parser = argparse.ArgumentParser(
        description="Wiki ↔ Raw 三向同步工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--check",
        action="store_true",
        help="预览变化,不执行同步",
    )
    parser.add_argument(
        "--sync",
        action="store_true",
        help="执行完整同步(新增/修改/删除 + orphan 检测)",
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="从 manifest 重建 wiki/index.md兜底方案",
    )
    parser.add_argument(
        "--reset-failed",
        action="store_true",
        help="重置所有 failed 的 ingest 状态(让它们重新待处理)",
    )
    parser.add_argument(
        "--pending",
        action="store_true",
        help="列出所有待摄取的 pending 文件",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="详细输出",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="JSON 行输出模式(供调用方解析)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="与 --pending --json 配合使用限制返回的条目数1 返回单条,>1 返回多条)。默认不限制(返回全部)。",
    )
    args = parser.parse_args()

    if args.rebuild:
        run_rebuild()
    elif args.pending:
        manifest = load_manifest()
        # .get(): tolerate a manifest without a "files" key.
        pending = [
            (rel_path, info)
            for rel_path, info in manifest.get("files", {}).items()
            if not info.get("ingested")
        ]
        if args.json:
            print(json.dumps(_pending_payload(pending, args.limit)))
        else:
            # Console output honours --limit as well.
            total = len(pending)
            n = total if args.limit is None else max(0, args.limit)
            print(f"=== Pending Ingest Files ({total}) ===\n")
            if n == 0:
                print(" (no items to show)")
            else:
                for i, (path, _info) in enumerate(pending[:n], 1):
                    print(f"{i:3}. {path}")
    elif args.reset_failed:
        manifest = load_manifest()
        reset_count = 0
        for entry in manifest.get("files", {}).values():
            if entry.get("error"):
                # Clear the failure so the entry counts as pending again.
                entry["ingested"] = False
                entry.pop("error", None)
                entry.pop("ingested_at", None)
                reset_count += 1
        if reset_count > 0:
            save_manifest(manifest)
            print(f"Reset {reset_count} failed entries to pending.")
        else:
            print("No failed entries found.")
    elif args.check:
        run_check()
    elif args.sync:
        run_sync(dry_run=False, verbose=args.verbose, json_mode=args.json)
    else:
        parser.print_help()
        print("\n示例:")
        print(" python tools/sync.py --check # 预览变化")
        print(" python tools/sync.py --sync # 执行同步")
        print(" python tools/sync.py --sync -v # 详细模式")
        print(" python tools/sync.py --rebuild # 重建 index")


if __name__ == "__main__":
    main()