新增 mark-ingested方法

This commit is contained in:
2026-04-21 17:24:35 +08:00
parent 8cba485187
commit cbba7a4923
2 changed files with 551 additions and 395 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -529,53 +529,201 @@ def run_rebuild():
print(f"\nDone.") print(f"\nDone.")
# ─── 管理接口:mark_ingested供摄取流程调用 ───────────────────────────────────────── # ─── 管理接口:reslug批量规范化 manifest slug ──────────────────────────────────────
def mark_ingested(rel_path: str, slug: str = None, source_path: str = None, recalc_hash: bool = True, json_mode: bool = False): def _compute_normalized_slug(rel_path: str) -> str:
"""标记某个 raw 文件为已摄取(更新 manifest 条目) """根据规则从 raw 文件路径计算规范化 slug
参数: 规则:
rel_path: 相对于仓库根目录的路径,例如 "raw/dir/name.md" (必填 a. 中文字符直接保留(不转拼音
slug: 可选的 wiki slug例如 "my-article");如果传入会设置 entry["slug"] b. ASCII 大写字母转小写
source_path: 可选的 wiki 来源路径(例如 "wiki/sources/my-article.md" c. 空格和特殊字符(引号、斜杠、问号、冒号、逗号、句号、感叹号、括号、
recalc_hash: 如果为 True会基于当前文件重新计算 hash/modified 全角符号等)替换为 `-`
json_mode: 如果为 True输出为单行 JSON便于脚本消费 d. 连续多个 `-` 压缩为单个 `-`,并去除首尾 `-`
"""
import re
stem = Path(rel_path).stem
# 转小写(仅影响 ASCII 字母,中文不变)
result = stem.lower()
# 将特殊字符替换为 `-`
# 保留中文字符、ASCII 字母数字、点(在版本号如 0.65.0 中保留)、下划线
result = re.sub(
r'[ \t\r\n'
r'\'"' # 单双引号
r'/\\\\' # 斜杠(全角/半角/反斜杠)
r'?' # 问号
r':' # 冒号
r',' # 逗号
r'\.' # 句号(保留版本号小数点后面会被压缩)
r'!' # 感叹号
r'()' # 括号
r'【】\[\]' # 方括号
r'《》<>' # 书名号/尖括号
r'' # 顿号
r'—–\-' # 破折号/连字符(统一重新处理)
r'|&@#%\^*+=~`'
r';' # 分号
r']+',
'-',
result,
)
# 压缩连续 `-` 为单个
result = re.sub(r'-{2,}', '-', result)
# 去除首尾 `-`
result = result.strip('-')
return result or 'untitled'
def run_reslug(target_rel_path: str = None, dry_run: bool = False):
    """Normalize the slug / source_path of manifest entries, in bulk or singly.

    For every selected entry the slug is recomputed from the raw path with
    ``_compute_normalized_slug`` and ``source_path`` is set to
    ``wiki/sources/<slug>.md``. A summary and a per-entry preview are printed;
    the manifest is saved only when something changed and ``dry_run`` is off.

    Args:
        target_rel_path: Raw-file path (manifest key) to process. When falsy,
            every entry in the manifest is processed.
        dry_run: When True, print the preview but do not write the manifest.
    """
    manifest = load_manifest()
    files = manifest.get("files", {})

    if target_rel_path:
        if target_rel_path not in files:
            print(red(f" ✗ Not found in manifest: {target_rel_path}"))
            return
        targets = [(target_rel_path, files[target_rel_path])]
    else:
        targets = list(files.items())

    changed = []
    for rel_path, info in targets:
        new_slug = _compute_normalized_slug(rel_path)
        new_source_path = f"wiki/sources/{new_slug}.md"
        old_slug = info.get("slug", "")
        old_source_path = info.get("source_path", "")
        if new_slug == old_slug and new_source_path == old_source_path:
            continue
        changed.append({
            "rel_path": rel_path,
            "old_slug": old_slug,
            "new_slug": new_slug,
            "old_source_path": old_source_path,
            "new_source_path": new_source_path,
        })
    skipped = len(targets) - len(changed)

    print(f"\n{bold('=== Reslug Preview' if dry_run else '=== Reslug')}\n")
    print(f" Total entries scanned : {len(targets)}")
    print(f" Unchanged (skipped) : {skipped}")
    print(f" To update : {len(changed)}\n")

    if not changed:
        print(f" {green('')} All slugs already normalized.\n")
        return

    for item in changed:
        print(f" {dim(item['rel_path'])}")
        if item['old_slug'] != item['new_slug']:
            # Separator keeps the old and new values from running together.
            print(f" slug : {yellow(item['old_slug'])} -> {green(item['new_slug'])}")
        if item['old_source_path'] != item['new_source_path']:
            print(f" src : {yellow(item['old_source_path'])} -> {green(item['new_source_path'])}")
    print()

    # Normalization is lossy (only the file stem is used), so distinct raw
    # files can end up sharing one slug. Report that instead of letting two
    # entries silently point at the same wiki page. Warning only: the update
    # itself is still applied as requested.
    final_slugs = {rel: info.get("slug", "") for rel, info in files.items()}
    final_slugs.update((item["rel_path"], item["new_slug"]) for item in changed)
    owners = {}
    for rel, slug in final_slugs.items():
        owners.setdefault(slug, []).append(rel)
    changed_paths = {item["rel_path"] for item in changed}
    for slug in sorted(owners):
        rels = owners[slug]
        if slug and len(rels) > 1 and changed_paths.intersection(rels):
            print(yellow(f" ⚠ Slug collision: {slug} <- {', '.join(sorted(rels))}"))

    if dry_run:
        print(f" {yellow('')} Dry-run — manifest NOT updated. Re-run without --dry-run to apply.\n")
        return

    # Apply the changes in place and persist them.
    for item in changed:
        entry = files[item["rel_path"]]
        entry["slug"] = item["new_slug"]
        entry["source_path"] = item["new_source_path"]
    save_manifest(manifest)
    print(f" {green('')} manifest.json updated ({len(changed)} entries changed).\n")
# ─── 管理接口mark_ingested供摄取流程调用 ─────────────────────────────────────────
def mark_ingested(rel_path: str, slug: str, json_mode: bool = False):
    """Mark a raw file as ingested by updating its manifest entry.

    Behaviour:
      - ``slug`` must be a non-blank string; otherwise an error is reported
        and the process exits with status 1.
      - ``rel_path`` must already be a key in the manifest (the error message
        points the user at ``--sync``); otherwise the same error exit occurs.
      - ``slug`` is stripped and ``source_path`` is derived from it as
        ``wiki/sources/<slug>.md``.
      - If the raw file exists, ``hash`` and ``modified`` (file mtime, UTC,
        ISO format) are recalculated; if not, the previous values are kept.
      - ``ingested`` is set to True, ``ingested_at`` to ``iso_now()``, and
        any ``error`` field is removed. The manifest is then saved.

    Args:
        rel_path: Path relative to the repo root, e.g. ``"raw/dir/name.md"``.
        slug: Wiki slug, e.g. ``"my-article"``.
        json_mode: When True, emit a single-line JSON object instead of
            human-readable text.

    Raises:
        SystemExit: With code 1 when ``slug`` is blank or ``rel_path`` is not
            in the manifest.
    """
    def _fail(msg):
        # One place for both error exits, so the two output modes stay in sync.
        if json_mode:
            print(json.dumps({"event": "error", "message": msg}))
        else:
            print(red(msg))
        raise SystemExit(1)

    clean_slug = slug.strip() if slug else ""
    if not clean_slug:
        _fail("--slug is required for --mark-ingested")

    manifest = load_manifest()
    files = manifest.get("files", {})
    if rel_path not in files:
        _fail(f"rel_path not found in manifest (run --sync first): {rel_path}")

    # `entry` is the dict stored in the manifest, so the mutations below are
    # persisted by save_manifest() without writing it back explicitly.
    entry = files[rel_path]
    entry["slug"] = clean_slug
    entry["source_path"] = f"wiki/sources/{clean_slug}.md"

    # Refresh hash / modified from the raw file when it is still on disk.
    abs_path = REPO_ROOT / rel_path
    raw_exists = abs_path.exists()
    if raw_exists:
        entry["hash"] = sha256_file(abs_path)
        entry["modified"] = datetime.fromtimestamp(abs_path.stat().st_mtime, tz=timezone.utc).isoformat()
    elif not json_mode:
        print(yellow(f" ⚠ Raw file not found, modified timestamp not updated: {rel_path}"))

    entry["ingested"] = True
    entry["ingested_at"] = iso_now()
    entry.pop("error", None)
    save_manifest(manifest)

    if json_mode:
        print(json.dumps({
            "event": "mark_ingested",
            "rel_path": rel_path,
            "slug": entry["slug"],
            "source_path": entry["source_path"],
            "modified": entry.get("modified"),
            "ingested_at": entry["ingested_at"],
            # Lets script consumers see that hash/modified were NOT refreshed;
            # text mode prints a warning for the same condition.
            "raw_exists": raw_exists,
        }))
    else:
        print(f" {green('')} Marked ingested: {rel_path}")
        print(f" slug : {entry['slug']}")
        print(f" source_path : {entry['source_path']}")
        print(f" modified : {entry.get('modified', '(unchanged)')}")
        print(f" ingested_at : {entry['ingested_at']}")
# ─── CLI 入口 ─────────────────────────────────────────────── # ─── CLI 入口 ───────────────────────────────────────────────
@@ -622,22 +770,13 @@ if __name__ == "__main__":
) )
parser.add_argument( parser.add_argument(
"--mark-ingested", "--mark-ingested",
metavar=("REL_PATH"), metavar="REL_PATH",
nargs=1, nargs=1,
help="标记单个 raw 文件为已摄取:传入相对路径(例如 'raw/dir/file.md')。", help="标记单个 raw 文件为已摄取:传入相对路径(例如 'raw/dir/file.md')。必须配合 --slug 使用。",
) )
parser.add_argument( parser.add_argument(
"--slug", "--slug",
help="与 --mark-ingested 配合:指定生成的 wiki slug例如 my-article", help="与 --mark-ingested 配合(必填):指定 wiki slug例如 my-article",
)
parser.add_argument(
"--source-path",
help="与 --mark-ingested 配合:指定 wiki source 路径(例如 wiki/sources/my-article.md",
)
parser.add_argument(
"--no-recalc-hash",
action="store_true",
help="与 --mark-ingested 配合:不要重新计算文件 hash/modified默认会重新计算",
) )
parser.add_argument( parser.add_argument(
"--mark-json", "--mark-json",
@@ -650,12 +789,29 @@ if __name__ == "__main__":
default=None, default=None,
help="与 --pending --json 配合:限制返回条目数(默认返回全部)", help="与 --pending --json 配合:限制返回条目数(默认返回全部)",
) )
parser.add_argument(
"--reslug",
action="store_true",
help="批量规范化 manifest 中的 slug/source_path中文保留ASCII 特殊字符转 -,大写转小写,压缩连续 -",
)
parser.add_argument(
"--reslug-target",
metavar="REL_PATH",
help="与 --reslug 配合:只处理指定的 raw 文件(例如 'raw/dir/file.md'",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="与 --reslug 配合:只预览变更,不写入 manifest",
)
args = parser.parse_args() args = parser.parse_args()
if args.mark_ingested: if args.mark_ingested:
rel = args.mark_ingested[0] rel = args.mark_ingested[0]
mark_ingested(rel, slug=args.slug, source_path=args.source_path, recalc_hash=not args.no_recalc_hash, json_mode=args.mark_json) mark_ingested(rel, slug=args.slug, json_mode=args.mark_json)
elif args.reslug:
run_reslug(target_rel_path=args.reslug_target, dry_run=args.dry_run)
elif args.rebuild: elif args.rebuild:
run_rebuild() run_rebuild()
elif args.pending: elif args.pending: