fix rebuild

This commit is contained in:
2026-04-21 16:15:58 +08:00
parent a31d28a386
commit 8cba485187
2 changed files with 333 additions and 116 deletions

View File

@@ -195,84 +195,60 @@ def check_changes(manifest: dict, raw_files: dict) -> dict:
def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = False):
print(f"\n{bold('=== Wiki Sync')}\n", end="")
print(f" Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
print(f" Raw: {REPO_ROOT / 'raw'}\n")
print(f" Wiki: {WIKI_DIR}\n")
print(f" Mode: {'DRY-RUN (preview only)' if dry_run else 'LIVE SYNC'}")
"""执行同步并尽量保持输出精简。
- 默认(非 verbose、非 json只会输出一行变化摘要 + manifest 更新成功提示。
- verbose=True 会打印每个新增/更新/删除的文件列表(保留旧行为)。
- json_mode=True 保持原有的机器友好 JSON 流输出。
"""
manifest = load_manifest()
log("manifest.json loaded", "info")
raw_files = scan_raw()
log(f"raw/ scan: {len(raw_files)} .md files found", "info")
changes = check_changes(manifest, raw_files)
total_changes = len(changes["new"]) + len(changes["updated"]) + len(changes["deleted"])
new = changes["new"]
updated = changes["updated"]
deleted = changes["deleted"]
total_changes = len(new) + len(updated) + len(deleted)
if total_changes == 0:
log("No changes detected — wiki is up to date.", "success")
if json_mode:
print(json.dumps({"event": "sync_complete", "summary": {"pending": 0, "deleted": 0, "manifest_entries": len(manifest.get("files", {}))}}))
else:
log("No changes detected — wiki is up to date.", "success")
return
# 非 JSON简短摘要默认或详细列表verbose
if not json_mode:
print(f"\n{bold('--- Changes ---')}")
print(f" {green('+')} New: {len(changes['new'])}")
print(f" {yellow('~')} Updated: {len(changes['updated'])}")
print(f" {red('-')} Deleted: {len(changes['deleted'])}")
if verbose or not dry_run:
if changes["new"]:
if not json_mode:
print(f"\n {bold('New Files:')}")
for f in changes["new"]:
slug = build_slug_from_path(f["rel_path"])
if json_mode:
print(json.dumps({"event": "new_detected", "rel_path": f["rel_path"], "slug": slug}))
else:
log(f"{green('[+')} {f['rel_path']}", "normal")
if changes["updated"]:
if not json_mode:
print(f"\n {bold('Updated Files:')}")
for f in changes["updated"]:
slug = manifest["files"].get(f["rel_path"], {}).get("slug") or build_slug_from_path(f["rel_path"])
if json_mode:
print(json.dumps({"event": "updated_detected", "rel_path": f["rel_path"], "slug": slug}))
else:
log(f"{yellow('[~]')} {f['rel_path']} (hash changed)", "normal")
if changes["deleted"]:
if not json_mode:
print(f"\n {bold('Deleted Files:')}")
for f in changes["deleted"]:
if json_mode:
print(json.dumps({"event": "deleted_detected", "rel_path": f["rel_path"]}))
else:
log(f"{red('[-]')} {f['rel_path']}", "normal")
log(f"Changes detected: +{len(new)} ~{len(updated)} -{len(deleted)}", "info")
if verbose:
if new:
print("\nNew Files:")
for f in new:
print(f" {f['rel_path']}")
if updated:
print("\nUpdated Files:")
for f in updated:
old = f.get("old_hash")
print(f" {f['rel_path']}" + (f" (was {old})" if old else ""))
if deleted:
print("\nDeleted Files:")
for f in deleted:
print(f" {f['rel_path']}")
if dry_run:
log("\nDry-run complete. Run with --sync to apply.", "warn")
log("Dry-run complete. Run with --sync to apply.", "warn")
return
# ─── Apply Sync ───
if not json_mode:
print(f"\n{bold('--- Applying Sync ---')}")
# Apply changes (保持原有 manifest 更新逻辑,但抑制逐文件日志,除非 json_mode 或 verbose)
updated_manifest = manifest.copy()
updated_manifest["files"] = manifest.get("files", {}).copy()
pending_files = []
# ① 新增 → 加入 manifest
for f in changes["new"]:
for f in new:
rel_path = f["rel_path"]
slug = build_slug_from_path(rel_path)
if json_mode:
print(json.dumps({"event": "pending", "rel_path": rel_path, "slug": slug, "action": "new"}))
pending_files.append({"rel_path": rel_path, "abs_path": f["abs_path"], "slug": slug, "action": "new"})
updated_manifest["files"][rel_path] = {
"hash": f["hash"],
"modified": f.get("modified"),
@@ -282,17 +258,13 @@ def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = Fal
"ingested_at": None,
}
# ② 修改 → 更新 manifest
for f in changes["updated"]:
for f in updated:
rel_path = f["rel_path"]
old_entry = manifest["files"].get(rel_path, {})
slug = old_entry.get("slug") or build_slug_from_path(rel_path)
if json_mode:
print(json.dumps({"event": "pending", "rel_path": rel_path, "slug": slug, "action": "updated"}))
pending_files.append({"rel_path": rel_path, "abs_path": f["abs_path"], "slug": slug, "action": "updated"})
updated_manifest["files"][rel_path] = {
**old_entry,
"hash": f["hash"],
@@ -301,23 +273,17 @@ def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = Fal
"ingested_at": None,
}
# ③ 删除 → 保留 wiki 内容,从 manifest 移除
deleted_files = []
for f in changes["deleted"]:
for f in deleted:
rel_path = f["rel_path"]
source_path = f.get("source_path")
if not json_mode:
log(f"Deleted: {rel_path}", "warn")
if source_path:
sp = WIKI_DIR / source_path
log(f" Wiki source kept: {sp}", "warn")
if rel_path in updated_manifest["files"]:
del updated_manifest["files"][rel_path]
deleted_files.append(rel_path)
if json_mode and deleted:
print(json.dumps({"event": "deleted_detected", "rel_path": rel_path}))
# 保存 manifest
save_manifest(updated_manifest)
log(f"\nmanifest.json updated ({len(updated_manifest['files'])} entries)", "success")
if json_mode:
print(json.dumps({
@@ -330,26 +296,33 @@ def run_sync(dry_run: bool = False, verbose: bool = False, json_mode: bool = Fal
"pending_files": pending_files,
"deleted_files": deleted_files,
}))
else:
log(f"manifest.json updated ({len(updated_manifest['files'])} entries)", "success")
if verbose:
log(f"Pending files for ingestion: {len(pending_files)}", "info")
# Orphan detection
# 简短的 orphan 报告(仅在 verbose 模式下列出详情)
orphan_entities, orphan_concepts = find_orphan_entity_concept(updated_manifest)
if not json_mode:
if orphan_entities or orphan_concepts:
print(f"\n{bold('--- Orphan Report (kept as requested) ---')}")
if orphan_entities:
print(f" {bold('Orphan Entities')} ({len(orphan_entities)}):")
for e in sorted(orphan_entities):
print(f" {dim('?')} {e}")
if orphan_concepts:
print(f" {bold('Orphan Concepts')} ({len(orphan_concepts)}):")
for c in sorted(orphan_concepts):
print(f" {dim('?')} {c}")
log("\nOrphan pages are kept (not deleted per user request).", "info")
if verbose:
print(f"\n{bold('--- Orphan Report (kept as requested) ---')}")
if orphan_entities:
print(f"Orphan Entities ({len(orphan_entities)}):")
for e in sorted(orphan_entities):
print(f" {e}")
if orphan_concepts:
print(f"Orphan Concepts ({len(orphan_concepts)}):")
for c in sorted(orphan_concepts):
print(f" {c}")
else:
log(f"Orphan entities: {len(orphan_entities)}; Orphan concepts: {len(orphan_concepts)}", "info")
else:
log("No orphan entity/concept detected.", "success")
if verbose:
log("No orphan entity/concept detected.", "success")
print(f"\n{bold('Done.')}")
print(f"\n Pending files for ingestion: {len(pending_files)}")
if not json_mode:
print("\nDone.")
def run_check():
@@ -388,7 +361,17 @@ def run_check():
def run_rebuild():
"""从 manifest 重建 wiki/index.md兜底方案"""
"""从 manifest 重建 wiki/index.md兜底方案
改进点:
- 优先使用 manifest 中记录的 source_path如果存在且文件真实存在
其次尝试 wiki/sources/<slug>.md再尝试在 wiki/sources 下做不区分大小写或
归一化后的匹配(减少命名差异导致的断链)。
- 更健壮地解析 YAML frontmatter 中的 title 字段(支持缺失结束符的容错),
并在没有 title 时回退到第一个 Markdown 标题或 slug。
- 在无法找到 source 文件时,保留原 slug 并在 index 中标注 (source missing)
以便人工排查。
"""
manifest = load_manifest()
print(f"\n{bold('=== Wiki Rebuild from Manifest')}\n")
print(f" Manifest entries: {len(manifest.get('files', {}))}")
@@ -404,21 +387,139 @@ def run_rebuild():
files = manifest.get("files", {})
sorted_files = sorted(files.items(), key=lambda x: x[1].get("modified", ""), reverse=True)
for rel_path, info in sorted_files:
slug = info.get("slug", build_slug_from_path(rel_path))
source_md_path = WIKI_DIR / "sources" / f"{slug}.md"
if source_md_path.exists():
title = source_md_path.read_text(encoding="utf-8").split("\n")[0].lstrip("# ").strip()
index_lines.append(f"- [{title}](sources/{slug}.md)\n")
else:
index_lines.append(f"- [{slug}](sources/{slug}.md) — (source missing)\n")
import re
index_lines.append("\n## Entities\n\n## Concepts\n\n## Syntheses\n")
sources_dir = WIKI_DIR / "sources"
def normalize(s: str) -> str:
    """Reduce *s* to a loose comparison key for fuzzy file-name matching.

    The key is built by applying Unicode NFKC normalization, case-folding,
    and then dropping every character that is not alphanumeric. Two names
    that differ only in case, punctuation/separators, composed vs. decomposed
    accents, or full-width vs. ASCII letters therefore produce the same key.

    Args:
        s: A slug or file stem.

    Returns:
        The comparison key; may be empty when *s* has no alphanumeric
        characters.
    """
    # Function-scope import keeps this helper self-contained.
    import unicodedata

    # NFKC first: otherwise a decomposed "e" + combining accent loses the
    # accent in the isalnum() filter while the composed form keeps it, and
    # the two spellings of the same name would never compare equal.
    folded = unicodedata.normalize("NFKC", s).casefold()
    return ''.join(ch for ch in folded if ch.isalnum())
def find_source_file(slug: str, info: dict, rel_path: str) -> "Path | None":
    """Locate the wiki source page that belongs to one manifest entry.

    Candidates are tried in a fixed priority order and the first regular
    file found wins:

    1. ``info["source_path"]`` resolved against ``REPO_ROOT``, then against
       ``WIKI_DIR`` (covers paths recorded relative to the wiki directory,
       e.g. ``"sources/foo.md"``).
    2. ``sources_dir/<slug>.md``.
    3. The same with a stray trailing ``.md`` removed from *slug*.
    4. A file in ``sources_dir`` whose stem equals *slug* ignoring case.
    5. A file in ``sources_dir`` whose ``normalize``-d stem equals the
       normalized *slug*.
    6. ``sources_dir/<stem of rel_path>.md``.

    Args:
        slug: Wiki slug of the entry.
        info: Manifest entry; only its ``source_path`` key is read here.
        rel_path: Manifest key of the raw file, e.g. ``'raw/dir/name.md'``.

    Returns:
        The path of the matching file, or ``None`` when nothing matches.
    """
    # 1. Explicit source_path recorded in the manifest.
    sp = info.get('source_path')
    if sp:
        for base in (REPO_ROOT, WIKI_DIR):
            recorded = base / sp
            # is_file() rather than exists(): a directory with that name
            # cannot be read as a page by the caller.
            if recorded.is_file():
                return recorded

    # 2. Conventional location.
    direct = sources_dir / f"{slug}.md"
    if direct.is_file():
        return direct

    # 3. Slug that mistakenly carries the ".md" suffix.
    if slug.endswith('.md'):
        trimmed = sources_dir / f"{slug[:-3]}.md"
        if trimmed.is_file():
            return trimmed

    # 4./5. Fuzzy matching. The directory is listed once and sorted so the
    # result does not depend on file-system iteration order, and the stricter
    # case-insensitive pass fully completes before the looser normalized pass.
    if sources_dir.is_dir():
        pages = sorted(p for p in sources_dir.glob('*.md') if p.is_file())
        wanted = slug.lower()
        for page in pages:
            if page.stem.lower() == wanted:
                return page
        norm_slug = normalize(slug)
        # An empty key (slug without alphanumerics) would match unrelated
        # files whose stem also normalizes to "", so skip this pass.
        if norm_slug:
            for page in pages:
                if normalize(page.stem) == norm_slug:
                    return page

    # 6. Last resort: guess from the raw file name, for repositories that use
    # the raw file's own name for the page instead of the slug.
    try:
        raw_stem = Path(rel_path).stem
    except TypeError:
        # rel_path is not path-like (malformed manifest key): nothing to guess.
        return None
    guessed = sources_dir / f"{raw_stem}.md"
    if guessed.is_file():
        return guessed
    return None
for rel_path, info in sorted_files:
slug = info.get("slug") or build_slug_from_path(rel_path)
# 清理误带后缀
if slug.endswith('.md'):
slug = slug[:-3]
src_file = find_source_file(slug, info, rel_path)
# 从 manifest 的 modified 字段提取日期前缀(格式 YYYY-MM-DD
modified_raw = info.get("modified", "")
date_prefix = ""
if modified_raw:
try:
date_prefix = f"[{modified_raw[:10]}] "
except Exception:
date_prefix = ""
title = None
if src_file and src_file.exists():
content = src_file.read_text(encoding="utf-8")
lines = content.splitlines()
# 处理 YAML frontmatter容错若缺少结束 '---' 则忽略 frontmatter
if lines and lines[0].strip() == '---':
end_idx = None
for i in range(1, min(len(lines), 500)):
if lines[i].strip() == '---':
end_idx = i
break
if end_idx:
frontmatter = '\n'.join(lines[1:end_idx])
# 支持 title: "..." 或 title: > 的情况(简单提取首行)
m = re.search(r'^\s*title\s*:\s*(?:["\']?(.*?)["\']?|>\s*\n\s*(.*))\s*$', frontmatter, flags=re.MULTILINE)
if m:
title = (m.group(1) or m.group(2) or '').strip()
# 回退:第一个以 # 开头的行
if not title and lines:
for line in lines:
s = line.strip()
if s.startswith('#'):
title = s.lstrip('#').strip()
break
if not title:
title = slug
index_lines.append(f"- {date_prefix}[{title}](sources/{src_file.name})\n")
else:
# 如果没有找到 source 文件,但 manifest 里有 source_path 文本,则将其展示出来,便于排查
sp = info.get('source_path')
if sp:
index_lines.append(f"- {date_prefix}[{slug}](sources/{slug}.md) — (expected: {sp} — source missing)\n")
else:
index_lines.append(f"- {date_prefix}[{slug}](sources/{slug}.md) — (source missing)\n")
# Entities 索引
index_lines.append("\n## Entities\n")
entities_dir = WIKI_DIR / "entities"
if entities_dir.exists():
entity_files = sorted(entities_dir.glob("*.md"), key=lambda p: p.stem.lower())
for ef in entity_files:
index_lines.append(f"- [{ef.stem}](entities/{ef.name})\n")
# Concepts 索引
index_lines.append("\n## Concepts\n")
concepts_dir = WIKI_DIR / "concepts"
if concepts_dir.exists():
concept_files = sorted(concepts_dir.glob("*.md"), key=lambda p: p.stem.lower())
for cf in concept_files:
index_lines.append(f"- [{cf.stem}](concepts/{cf.name})\n")
index_lines.append("\n## Syntheses\n")
index_file = WIKI_DIR / "index.md"
index_file.write_text("".join(index_lines), encoding="utf-8")
print(f" {green('')} index.md rebuilt with {len(sorted_files)} sources")
# orphan 检测使用 manifest重建后也可根据最新 manifest 检测)
orphan_entities, orphan_concepts = find_orphan_entity_concept(manifest)
if orphan_entities:
print(f" {dim('?')} Orphan entities: {len(orphan_entities)}")
@@ -428,6 +529,55 @@ def run_rebuild():
print(f"\nDone.")
# ─── 管理接口mark_ingested供摄取流程调用 ─────────────────────────────────────────
def mark_ingested(rel_path: str, slug: str = None, source_path: str = None, recalc_hash: bool = True, json_mode: bool = False):
    """Mark one raw file as ingested by updating its manifest entry.

    The manifest is loaded, the entry for *rel_path* is created or updated,
    and the manifest is saved again. The entry always ends up with a
    non-empty ``slug`` and ``source_path``, ``ingested`` set to ``True``,
    a fresh ``ingested_at`` timestamp, and no ``error`` key.

    Args:
        rel_path: Path of the raw file relative to the repository root,
            e.g. ``"raw/dir/name.md"``. Also used as the manifest key.
        slug: Optional wiki slug (e.g. ``"my-article"``). When omitted, the
            entry's existing slug is kept, or one is derived from *rel_path*
            if the entry has none.
        source_path: Optional wiki source path
            (e.g. ``"wiki/sources/my-article.md"``). When omitted, the
            entry's existing value is kept, or ``wiki/sources/<slug>.md`` is
            filled in if the entry has none.
        recalc_hash: When True and the file exists under ``REPO_ROOT``,
            refresh ``hash`` and ``modified`` from the file on disk. A
            missing file is skipped without error.
        json_mode: When True, print the result as a single JSON line for
            scripts; otherwise print a one-line human-readable message.
    """
    manifest = load_manifest()
    files = manifest.setdefault("files", {})
    entry = files.get(rel_path, {})

    abs_path = REPO_ROOT / rel_path
    if recalc_hash and abs_path.exists():
        entry["hash"] = sha256_file(abs_path)
        entry["modified"] = datetime.fromtimestamp(abs_path.stat().st_mtime, tz=timezone.utc).isoformat()

    # Resolve the slug first and independently of source_path: previously a
    # call that supplied only source_path left a new entry without any slug.
    # A stored None/empty slug is treated as missing as well, so the default
    # source path below can never become "wiki/sources/None.md".
    if slug:
        entry["slug"] = slug
    elif not entry.get("slug"):
        entry["slug"] = build_slug_from_path(rel_path)

    if source_path:
        entry["source_path"] = source_path
    elif not entry.get("source_path"):
        entry["source_path"] = f"wiki/sources/{entry['slug']}.md"

    entry["ingested"] = True
    entry["ingested_at"] = iso_now()
    # Drop any "error" key left on the entry, since it is now marked ingested.
    entry.pop("error", None)

    files[rel_path] = entry
    save_manifest(manifest)

    if json_mode:
        print(json.dumps({
            "event": "mark_ingested",
            "rel_path": rel_path,
            "slug": entry.get("slug"),
            "source_path": entry.get("source_path"),
            "ingested_at": entry.get("ingested_at"),
        }))
    else:
        print(f"Marked ingested: {rel_path} -> {entry.get('source_path')}")
# ─── CLI 入口 ───────────────────────────────────────────────
if __name__ == "__main__":
@@ -470,16 +620,43 @@ if __name__ == "__main__":
action="store_true",
help="JSON 行输出模式(供调用方解析)",
)
parser.add_argument(
"--mark-ingested",
metavar=("REL_PATH"),
nargs=1,
help="标记单个 raw 文件为已摄取:传入相对路径(例如 'raw/dir/file.md')。",
)
parser.add_argument(
"--slug",
help="与 --mark-ingested 配合:指定生成的 wiki slug例如 my-article",
)
parser.add_argument(
"--source-path",
help="与 --mark-ingested 配合:指定 wiki source 路径(例如 wiki/sources/my-article.md",
)
parser.add_argument(
"--no-recalc-hash",
action="store_true",
help="与 --mark-ingested 配合:不要重新计算文件 hash/modified默认会重新计算",
)
parser.add_argument(
"--mark-json",
action="store_true",
help="与 --mark-ingested 配合:以 JSON 单行输出 mark 结果",
)
parser.add_argument(
"--limit",
type=int,
default=None,
help="与 --pending --json 配合使用:限制返回条目数(1 返回单条,>1 返回多条)。默认不限制(返回全部)",
help="与 --pending --json 配合:限制返回条目数(默认返回全部)",
)
args = parser.parse_args()
if args.rebuild:
if args.mark_ingested:
rel = args.mark_ingested[0]
mark_ingested(rel, slug=args.slug, source_path=args.source_path, recalc_hash=not args.no_recalc_hash, json_mode=args.mark_json)
elif args.rebuild:
run_rebuild()
elif args.pending:
manifest = load_manifest()