fix rebuild
This commit is contained in:
369
tools/sync.py
369
tools/sync.py
@@ -195,84 +195,60 @@ def check_changes(manifest: dict, raw_files: dict) -> dict:
|
||||
|
||||
|
||||
def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = False):
|
||||
print(f"\n{bold('=== Wiki Sync')}\n", end="")
|
||||
print(f" Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
|
||||
print(f" Raw: {REPO_ROOT / 'raw'}\n")
|
||||
print(f" Wiki: {WIKI_DIR}\n")
|
||||
print(f" Mode: {'DRY-RUN (preview only)' if dry_run else 'LIVE SYNC'}")
|
||||
"""执行同步并尽量保持输出精简。
|
||||
|
||||
- 默认(非 verbose、非 json)只会输出一行变化摘要 + manifest 更新成功提示。
|
||||
- verbose=True 会打印每个新增/更新/删除的文件列表(保留旧行为)。
|
||||
- json_mode=True 保持原有的机器友好 JSON 流输出。
|
||||
"""
|
||||
manifest = load_manifest()
|
||||
log("manifest.json loaded", "info")
|
||||
|
||||
raw_files = scan_raw()
|
||||
log(f"raw/ scan: {len(raw_files)} .md files found", "info")
|
||||
|
||||
changes = check_changes(manifest, raw_files)
|
||||
total_changes = len(changes["new"]) + len(changes["updated"]) + len(changes["deleted"])
|
||||
new = changes["new"]
|
||||
updated = changes["updated"]
|
||||
deleted = changes["deleted"]
|
||||
total_changes = len(new) + len(updated) + len(deleted)
|
||||
|
||||
if total_changes == 0:
|
||||
log("No changes detected — wiki is up to date.", "success")
|
||||
if json_mode:
|
||||
print(json.dumps({"event": "sync_complete", "summary": {"pending": 0, "deleted": 0, "manifest_entries": len(manifest.get("files", {}))}}))
|
||||
else:
|
||||
log("No changes detected — wiki is up to date.", "success")
|
||||
return
|
||||
|
||||
# 非 JSON:简短摘要(默认)或详细列表(verbose)
|
||||
if not json_mode:
|
||||
print(f"\n{bold('--- Changes ---')}")
|
||||
print(f" {green('+')} New: {len(changes['new'])}")
|
||||
print(f" {yellow('~')} Updated: {len(changes['updated'])}")
|
||||
print(f" {red('-')} Deleted: {len(changes['deleted'])}")
|
||||
|
||||
if verbose or not dry_run:
|
||||
if changes["new"]:
|
||||
if not json_mode:
|
||||
print(f"\n {bold('New Files:')}")
|
||||
for f in changes["new"]:
|
||||
slug = build_slug_from_path(f["rel_path"])
|
||||
if json_mode:
|
||||
print(json.dumps({"event": "new_detected", "rel_path": f["rel_path"], "slug": slug}))
|
||||
else:
|
||||
log(f"{green('[+')} {f['rel_path']}", "normal")
|
||||
|
||||
if changes["updated"]:
|
||||
if not json_mode:
|
||||
print(f"\n {bold('Updated Files:')}")
|
||||
for f in changes["updated"]:
|
||||
slug = manifest["files"].get(f["rel_path"], {}).get("slug") or build_slug_from_path(f["rel_path"])
|
||||
if json_mode:
|
||||
print(json.dumps({"event": "updated_detected", "rel_path": f["rel_path"], "slug": slug}))
|
||||
else:
|
||||
log(f"{yellow('[~]')} {f['rel_path']} (hash changed)", "normal")
|
||||
|
||||
if changes["deleted"]:
|
||||
if not json_mode:
|
||||
print(f"\n {bold('Deleted Files:')}")
|
||||
for f in changes["deleted"]:
|
||||
if json_mode:
|
||||
print(json.dumps({"event": "deleted_detected", "rel_path": f["rel_path"]}))
|
||||
else:
|
||||
log(f"{red('[-]')} {f['rel_path']}", "normal")
|
||||
log(f"Changes detected: +{len(new)} ~{len(updated)} -{len(deleted)}", "info")
|
||||
if verbose:
|
||||
if new:
|
||||
print("\nNew Files:")
|
||||
for f in new:
|
||||
print(f" {f['rel_path']}")
|
||||
if updated:
|
||||
print("\nUpdated Files:")
|
||||
for f in updated:
|
||||
old = f.get("old_hash")
|
||||
print(f" {f['rel_path']}" + (f" (was {old})" if old else ""))
|
||||
if deleted:
|
||||
print("\nDeleted Files:")
|
||||
for f in deleted:
|
||||
print(f" {f['rel_path']}")
|
||||
|
||||
if dry_run:
|
||||
log("\nDry-run complete. Run with --sync to apply.", "warn")
|
||||
log("Dry-run complete. Run with --sync to apply.", "warn")
|
||||
return
|
||||
|
||||
# ─── Apply Sync ───
|
||||
if not json_mode:
|
||||
print(f"\n{bold('--- Applying Sync ---')}")
|
||||
|
||||
# Apply changes (保持原有 manifest 更新逻辑,但抑制逐文件日志,除非 json_mode 或 verbose)
|
||||
updated_manifest = manifest.copy()
|
||||
updated_manifest["files"] = manifest.get("files", {}).copy()
|
||||
|
||||
pending_files = []
|
||||
|
||||
# ① 新增 → 加入 manifest
|
||||
for f in changes["new"]:
|
||||
for f in new:
|
||||
rel_path = f["rel_path"]
|
||||
slug = build_slug_from_path(rel_path)
|
||||
|
||||
if json_mode:
|
||||
print(json.dumps({"event": "pending", "rel_path": rel_path, "slug": slug, "action": "new"}))
|
||||
|
||||
pending_files.append({"rel_path": rel_path, "abs_path": f["abs_path"], "slug": slug, "action": "new"})
|
||||
|
||||
updated_manifest["files"][rel_path] = {
|
||||
"hash": f["hash"],
|
||||
"modified": f.get("modified"),
|
||||
@@ -282,17 +258,13 @@ def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = Fal
|
||||
"ingested_at": None,
|
||||
}
|
||||
|
||||
# ② 修改 → 更新 manifest
|
||||
for f in changes["updated"]:
|
||||
for f in updated:
|
||||
rel_path = f["rel_path"]
|
||||
old_entry = manifest["files"].get(rel_path, {})
|
||||
slug = old_entry.get("slug") or build_slug_from_path(rel_path)
|
||||
|
||||
if json_mode:
|
||||
print(json.dumps({"event": "pending", "rel_path": rel_path, "slug": slug, "action": "updated"}))
|
||||
|
||||
pending_files.append({"rel_path": rel_path, "abs_path": f["abs_path"], "slug": slug, "action": "updated"})
|
||||
|
||||
updated_manifest["files"][rel_path] = {
|
||||
**old_entry,
|
||||
"hash": f["hash"],
|
||||
@@ -301,23 +273,17 @@ def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = Fal
|
||||
"ingested_at": None,
|
||||
}
|
||||
|
||||
# ③ 删除 → 保留 wiki 内容,从 manifest 移除
|
||||
deleted_files = []
|
||||
for f in changes["deleted"]:
|
||||
for f in deleted:
|
||||
rel_path = f["rel_path"]
|
||||
source_path = f.get("source_path")
|
||||
if not json_mode:
|
||||
log(f"Deleted: {rel_path}", "warn")
|
||||
if source_path:
|
||||
sp = WIKI_DIR / source_path
|
||||
log(f" Wiki source kept: {sp}", "warn")
|
||||
if rel_path in updated_manifest["files"]:
|
||||
del updated_manifest["files"][rel_path]
|
||||
deleted_files.append(rel_path)
|
||||
if json_mode and deleted:
|
||||
print(json.dumps({"event": "deleted_detected", "rel_path": rel_path}))
|
||||
|
||||
# 保存 manifest
|
||||
save_manifest(updated_manifest)
|
||||
log(f"\nmanifest.json updated ({len(updated_manifest['files'])} entries)", "success")
|
||||
|
||||
if json_mode:
|
||||
print(json.dumps({
|
||||
@@ -330,26 +296,33 @@ def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = Fal
|
||||
"pending_files": pending_files,
|
||||
"deleted_files": deleted_files,
|
||||
}))
|
||||
else:
|
||||
log(f"manifest.json updated ({len(updated_manifest['files'])} entries)", "success")
|
||||
if verbose:
|
||||
log(f"Pending files for ingestion: {len(pending_files)}", "info")
|
||||
|
||||
# Orphan detection
|
||||
# 简短的 orphan 报告(仅在 verbose 模式下列出详情)
|
||||
orphan_entities, orphan_concepts = find_orphan_entity_concept(updated_manifest)
|
||||
if not json_mode:
|
||||
if orphan_entities or orphan_concepts:
|
||||
print(f"\n{bold('--- Orphan Report (kept as requested) ---')}")
|
||||
if orphan_entities:
|
||||
print(f" {bold('Orphan Entities')} ({len(orphan_entities)}):")
|
||||
for e in sorted(orphan_entities):
|
||||
print(f" {dim('?')} {e}")
|
||||
if orphan_concepts:
|
||||
print(f" {bold('Orphan Concepts')} ({len(orphan_concepts)}):")
|
||||
for c in sorted(orphan_concepts):
|
||||
print(f" {dim('?')} {c}")
|
||||
log("\nOrphan pages are kept (not deleted per user request).", "info")
|
||||
if verbose:
|
||||
print(f"\n{bold('--- Orphan Report (kept as requested) ---')}")
|
||||
if orphan_entities:
|
||||
print(f"Orphan Entities ({len(orphan_entities)}):")
|
||||
for e in sorted(orphan_entities):
|
||||
print(f" {e}")
|
||||
if orphan_concepts:
|
||||
print(f"Orphan Concepts ({len(orphan_concepts)}):")
|
||||
for c in sorted(orphan_concepts):
|
||||
print(f" {c}")
|
||||
else:
|
||||
log(f"Orphan entities: {len(orphan_entities)}; Orphan concepts: {len(orphan_concepts)}", "info")
|
||||
else:
|
||||
log("No orphan entity/concept detected.", "success")
|
||||
if verbose:
|
||||
log("No orphan entity/concept detected.", "success")
|
||||
|
||||
print(f"\n{bold('Done.')}")
|
||||
print(f"\n Pending files for ingestion: {len(pending_files)}")
|
||||
if not json_mode:
|
||||
print("\nDone.")
|
||||
|
||||
|
||||
def run_check():
|
||||
@@ -388,7 +361,17 @@ def run_check():
|
||||
|
||||
|
||||
def run_rebuild():
|
||||
"""从 manifest 重建 wiki/index.md(兜底方案)"""
|
||||
"""从 manifest 重建 wiki/index.md(兜底方案)。
|
||||
|
||||
改进点:
|
||||
- 优先使用 manifest 中记录的 source_path(如果存在且文件真实存在),
|
||||
其次尝试 wiki/sources/<slug>.md;再尝试在 wiki/sources 下做不区分大小写或
|
||||
归一化后的匹配(减少命名差异导致的断链)。
|
||||
- 更健壮地解析 YAML frontmatter 中的 title 字段(支持缺失结束符的容错),
|
||||
并在没有 title 时回退到第一个 Markdown 标题或 slug。
|
||||
- 在无法找到 source 文件时,保留原 slug 并在 index 中标注 (source missing),
|
||||
以便人工排查。
|
||||
"""
|
||||
manifest = load_manifest()
|
||||
print(f"\n{bold('=== Wiki Rebuild from Manifest')}\n")
|
||||
print(f" Manifest entries: {len(manifest.get('files', {}))}")
|
||||
@@ -404,21 +387,139 @@ def run_rebuild():
|
||||
files = manifest.get("files", {})
|
||||
sorted_files = sorted(files.items(), key=lambda x: x[1].get("modified", ""), reverse=True)
|
||||
|
||||
for rel_path, info in sorted_files:
|
||||
slug = info.get("slug", build_slug_from_path(rel_path))
|
||||
source_md_path = WIKI_DIR / "sources" / f"{slug}.md"
|
||||
if source_md_path.exists():
|
||||
title = source_md_path.read_text(encoding="utf-8").split("\n")[0].lstrip("# ").strip()
|
||||
index_lines.append(f"- [{title}](sources/{slug}.md)\n")
|
||||
else:
|
||||
index_lines.append(f"- [{slug}](sources/{slug}.md) — (source missing)\n")
|
||||
import re
|
||||
|
||||
index_lines.append("\n## Entities\n\n## Concepts\n\n## Syntheses\n")
|
||||
sources_dir = WIKI_DIR / "sources"
|
||||
|
||||
def normalize(s: str) -> str:
    """Return a loose comparison key for a file name or slug.

    The key is built by applying Unicode NFKC normalization, case-folding,
    and then dropping every character that is not alphanumeric. Two names
    that differ only in case, punctuation/whitespace, or Unicode
    composition (e.g. NFC vs. NFD file names) produce the same key.

    Args:
        s: The file stem or slug to normalize.

    Returns:
        The normalized key; an empty string when ``s`` contains no
        alphanumeric characters.
    """
    # Imported locally so the helper does not rely on a module-level import.
    import unicodedata

    # NFKC first: otherwise a decomposed "e" + combining accent loses the
    # accent in the isalnum() filter while the composed form keeps it, and
    # the two spellings of the same name would never match.
    folded = unicodedata.normalize("NFKC", s).casefold()
    return ''.join(ch for ch in folded if ch.isalnum())
|
||||
|
||||
def find_source_file(slug: str, info: dict, rel_path: str):
    """Locate the wiki source page that belongs to a manifest entry.

    Candidates are tried from most to least specific, and the first one
    that is an existing regular file wins:

    1. ``info["source_path"]`` resolved against ``REPO_ROOT`` and then
       against ``WIKI_DIR`` (for paths stored relative to the wiki).
    2. ``<sources_dir>/<slug>.md``, plus the same with a stray ``.md``
       suffix stripped from ``slug``.
    3. A page in ``sources_dir`` whose stem equals ``slug`` ignoring case.
    4. A page in ``sources_dir`` whose ``normalize()`` key equals that of
       ``slug``.
    5. ``<sources_dir>/<stem of rel_path>.md``.

    Args:
        slug: Wiki slug recorded for (or derived from) the entry.
        info: The manifest entry; only ``source_path`` is read.
        rel_path: Path of the raw file the entry describes.

    Returns:
        The matching path, or ``None`` when no candidate exists.
    """
    # 1. Prefer the location recorded in the manifest.
    sp = info.get('source_path')
    if sp:
        for base in (REPO_ROOT, WIKI_DIR):
            recorded = base / sp
            # is_file() rather than exists(): a directory must not be
            # returned, because the caller reads the result as text.
            if recorded.is_file():
                return recorded

    # 2. Conventional location, also tolerating a slug that already
    #    carries a ".md" suffix by mistake.
    names = [slug]
    if slug.endswith('.md'):
        names.append(slug[:-3])
    for name in names:
        candidate = sources_dir / f"{name}.md"
        if candidate.is_file():
            return candidate

    # 3./4. Fuzzy matching. glob() yields entries in arbitrary order, so
    # the pages are sorted for a reproducible result, and the stricter
    # comparison runs as its own pass so that it always takes priority
    # over the looser normalized comparison.
    if sources_dir.is_dir():
        pages = sorted(p for p in sources_dir.glob('*.md') if p.is_file())

        slug_lower = slug.lower()
        for page in pages:
            if page.stem.lower() == slug_lower:
                return page

        norm_slug = normalize(slug)
        # An empty key (slug without alphanumerics) would match any page
        # whose stem is also pure punctuation, so skip the pass then.
        if norm_slug:
            for page in pages:
                if normalize(page.stem) == norm_slug:
                    return page

    # 5. Last resort: guess from the raw file's own name, e.g.
    #    'raw/dir/name.md' -> '<sources_dir>/name.md'.
    by_raw_name = sources_dir / f"{Path(rel_path).stem}.md"
    if by_raw_name.is_file():
        return by_raw_name

    return None
|
||||
|
||||
for rel_path, info in sorted_files:
|
||||
slug = info.get("slug") or build_slug_from_path(rel_path)
|
||||
# 清理误带后缀
|
||||
if slug.endswith('.md'):
|
||||
slug = slug[:-3]
|
||||
|
||||
src_file = find_source_file(slug, info, rel_path)
|
||||
|
||||
# 从 manifest 的 modified 字段提取日期前缀(格式 YYYY-MM-DD)
|
||||
modified_raw = info.get("modified", "")
|
||||
date_prefix = ""
|
||||
if modified_raw:
|
||||
try:
|
||||
date_prefix = f"[{modified_raw[:10]}] "
|
||||
except Exception:
|
||||
date_prefix = ""
|
||||
|
||||
title = None
|
||||
if src_file and src_file.exists():
|
||||
content = src_file.read_text(encoding="utf-8")
|
||||
lines = content.splitlines()
|
||||
|
||||
# 处理 YAML frontmatter(容错:若缺少结束 '---' 则忽略 frontmatter)
|
||||
if lines and lines[0].strip() == '---':
|
||||
end_idx = None
|
||||
for i in range(1, min(len(lines), 500)):
|
||||
if lines[i].strip() == '---':
|
||||
end_idx = i
|
||||
break
|
||||
if end_idx:
|
||||
frontmatter = '\n'.join(lines[1:end_idx])
|
||||
# 支持 title: "..." 或 title: > 的情况(简单提取首行)
|
||||
m = re.search(r'^\s*title\s*:\s*(?:["\']?(.*?)["\']?|>\s*\n\s*(.*))\s*$', frontmatter, flags=re.MULTILINE)
|
||||
if m:
|
||||
title = (m.group(1) or m.group(2) or '').strip()
|
||||
|
||||
# 回退:第一个以 # 开头的行
|
||||
if not title and lines:
|
||||
for line in lines:
|
||||
s = line.strip()
|
||||
if s.startswith('#'):
|
||||
title = s.lstrip('#').strip()
|
||||
break
|
||||
|
||||
if not title:
|
||||
title = slug
|
||||
|
||||
index_lines.append(f"- {date_prefix}[{title}](sources/{src_file.name})\n")
|
||||
else:
|
||||
# 如果没有找到 source 文件,但 manifest 里有 source_path 文本,则将其展示出来,便于排查
|
||||
sp = info.get('source_path')
|
||||
if sp:
|
||||
index_lines.append(f"- {date_prefix}[{slug}](sources/{slug}.md) — (expected: {sp} — source missing)\n")
|
||||
else:
|
||||
index_lines.append(f"- {date_prefix}[{slug}](sources/{slug}.md) — (source missing)\n")
|
||||
|
||||
# Entities 索引
|
||||
index_lines.append("\n## Entities\n")
|
||||
entities_dir = WIKI_DIR / "entities"
|
||||
if entities_dir.exists():
|
||||
entity_files = sorted(entities_dir.glob("*.md"), key=lambda p: p.stem.lower())
|
||||
for ef in entity_files:
|
||||
index_lines.append(f"- [{ef.stem}](entities/{ef.name})\n")
|
||||
|
||||
# Concepts 索引
|
||||
index_lines.append("\n## Concepts\n")
|
||||
concepts_dir = WIKI_DIR / "concepts"
|
||||
if concepts_dir.exists():
|
||||
concept_files = sorted(concepts_dir.glob("*.md"), key=lambda p: p.stem.lower())
|
||||
for cf in concept_files:
|
||||
index_lines.append(f"- [{cf.stem}](concepts/{cf.name})\n")
|
||||
|
||||
index_lines.append("\n## Syntheses\n")
|
||||
|
||||
index_file = WIKI_DIR / "index.md"
|
||||
index_file.write_text("".join(index_lines), encoding="utf-8")
|
||||
print(f" {green('✓')} index.md rebuilt with {len(sorted_files)} sources")
|
||||
|
||||
# orphan 检测使用 manifest(重建后也可根据最新 manifest 检测)
|
||||
orphan_entities, orphan_concepts = find_orphan_entity_concept(manifest)
|
||||
if orphan_entities:
|
||||
print(f" {dim('?')} Orphan entities: {len(orphan_entities)}")
|
||||
@@ -428,6 +529,55 @@ def run_rebuild():
|
||||
print(f"\nDone.")
|
||||
|
||||
|
||||
# ─── 管理接口:mark_ingested(供摄取流程调用) ─────────────────────────────────────────
|
||||
|
||||
def mark_ingested(rel_path: str, slug: str = None, source_path: str = None, recalc_hash: bool = True, json_mode: bool = False):
    """Mark one raw file as ingested by updating its manifest entry.

    The manifest is loaded, the entry for ``rel_path`` is created or
    updated, and the manifest is saved again. A one-line report is printed
    afterwards.

    Args:
        rel_path: Path relative to the repository root, e.g.
            "raw/dir/name.md" (required).
        slug: Optional wiki slug (e.g. "my-article"); when given it is
            stored as ``entry["slug"]``.
        source_path: Optional wiki source path (e.g.
            "wiki/sources/my-article.md"). When omitted, missing ``slug``
            and ``source_path`` fields are filled with defaults derived
            from ``rel_path``.
        recalc_hash: When true and the file exists, ``hash`` and
            ``modified`` are recomputed from the current file.
        json_mode: When true, the report is a single JSON line for script
            consumption; otherwise a plain text line.
    """
    manifest = load_manifest()
    entries = manifest.setdefault("files", {})
    record = entries.get(rel_path, {})

    raw_file = REPO_ROOT / rel_path
    if recalc_hash and raw_file.exists():
        record["hash"] = sha256_file(raw_file)
        mtime = raw_file.stat().st_mtime
        record["modified"] = datetime.fromtimestamp(mtime, tz=timezone.utc).isoformat()

    if slug:
        record["slug"] = slug

    if source_path:
        record["source_path"] = source_path
    else:
        # Only fill the gaps: values already present in the entry win.
        fallback_slug = build_slug_from_path(rel_path)
        record.setdefault("slug", fallback_slug)
        record.setdefault("source_path", f"wiki/sources/{record.get('slug')}.md")

    record.update(ingested=True, ingested_at=iso_now())
    # A successful ingestion clears any previously recorded error.
    record.pop("error", None)

    entries[rel_path] = record
    save_manifest(manifest)

    if not json_mode:
        print(f"Marked ingested: {rel_path} -> {record.get('source_path')}")
        return

    payload = {
        "event": "mark_ingested",
        "rel_path": rel_path,
        "slug": record.get("slug"),
        "source_path": record.get("source_path"),
        "ingested_at": record.get("ingested_at"),
    }
    print(json.dumps(payload))
|
||||
|
||||
|
||||
# ─── CLI 入口 ───────────────────────────────────────────────
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -470,16 +620,43 @@ if __name__ == "__main__":
|
||||
action="store_true",
|
||||
help="JSON 行输出模式(供调用方解析)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mark-ingested",
|
||||
metavar=("REL_PATH"),
|
||||
nargs=1,
|
||||
help="标记单个 raw 文件为已摄取:传入相对路径(例如 'raw/dir/file.md')。",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--slug",
|
||||
help="与 --mark-ingested 配合:指定生成的 wiki slug(例如 my-article)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--source-path",
|
||||
help="与 --mark-ingested 配合:指定 wiki source 路径(例如 wiki/sources/my-article.md)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-recalc-hash",
|
||||
action="store_true",
|
||||
help="与 --mark-ingested 配合:不要重新计算文件 hash/modified(默认会重新计算)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mark-json",
|
||||
action="store_true",
|
||||
help="与 --mark-ingested 配合:以 JSON 单行输出 mark 结果",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--limit",
|
||||
type=int,
|
||||
default=None,
|
||||
help="与 --pending --json 配合使用:限制返回的条目数(1 返回单条,>1 返回多条)。默认不限制(返回全部)。",
|
||||
help="与 --pending --json 配合:限制返回条目数(默认返回全部)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.rebuild:
|
||||
if args.mark_ingested:
|
||||
rel = args.mark_ingested[0]
|
||||
mark_ingested(rel, slug=args.slug, source_path=args.source_path, recalc_hash=not args.no_recalc_hash, json_mode=args.mark_json)
|
||||
elif args.rebuild:
|
||||
run_rebuild()
|
||||
elif args.pending:
|
||||
manifest = load_manifest()
|
||||
|
||||
Reference in New Issue
Block a user