新增 mark-ingested方法
This commit is contained in:
230
tools/sync.py
230
tools/sync.py
@@ -529,53 +529,201 @@ def run_rebuild():
|
||||
print(f"\nDone.")
|
||||
|
||||
|
||||
# ─── 管理接口:mark_ingested(供摄取流程调用) ─────────────────────────────────────────
|
||||
# ─── 管理接口:reslug(批量规范化 manifest slug) ──────────────────────────────────────
|
||||
|
||||
def mark_ingested(rel_path: str, slug: str = None, source_path: str = None, recalc_hash: bool = True, json_mode: bool = False):
|
||||
"""标记某个 raw 文件为已摄取(更新 manifest 条目)。
|
||||
def _compute_normalized_slug(rel_path: str) -> str:
|
||||
"""根据规则从 raw 文件路径计算规范化 slug。
|
||||
|
||||
参数:
|
||||
rel_path: 相对于仓库根目录的路径,例如 "raw/dir/name.md" (必填)
|
||||
slug: 可选的 wiki slug(例如 "my-article");如果传入会设置 entry["slug"]
|
||||
source_path: 可选的 wiki 来源路径(例如 "wiki/sources/my-article.md")
|
||||
recalc_hash: 如果为 True,会基于当前文件重新计算 hash/modified
|
||||
json_mode: 如果为 True,输出为单行 JSON,便于脚本消费
|
||||
规则:
|
||||
a. 中文字符直接保留(不转拼音)
|
||||
b. ASCII 大写字母转小写
|
||||
c. 空格和特殊字符(引号、斜杠、问号、冒号、逗号、句号、感叹号、括号、
|
||||
全角符号等)替换为 `-`
|
||||
d. 连续多个 `-` 压缩为单个 `-`,并去除首尾 `-`
|
||||
"""
|
||||
import re
|
||||
stem = Path(rel_path).stem
|
||||
|
||||
# 转小写(仅影响 ASCII 字母,中文不变)
|
||||
result = stem.lower()
|
||||
|
||||
# 将特殊字符替换为 `-`
|
||||
# 保留:中文字符、ASCII 字母数字、点(在版本号如 0.65.0 中保留)、下划线
|
||||
result = re.sub(
|
||||
r'[ \t\r\n'
|
||||
r'\'"' # 单双引号
|
||||
r'//\\\\' # 斜杠(全角/半角/反斜杠)
|
||||
r'??' # 问号
|
||||
r'::' # 冒号
|
||||
r',,' # 逗号
|
||||
r'。\.' # 句号(保留版本号小数点后面会被压缩)
|
||||
r'!!' # 感叹号
|
||||
r'()()' # 括号
|
||||
r'【】\[\]' # 方括号
|
||||
r'《》<>' # 书名号/尖括号
|
||||
r'、' # 顿号
|
||||
r'—–\-' # 破折号/连字符(统一重新处理)
|
||||
r'|&@#%\^*+=~`'
|
||||
r';;' # 分号
|
||||
r']+',
|
||||
'-',
|
||||
result,
|
||||
)
|
||||
|
||||
# 压缩连续 `-` 为单个
|
||||
result = re.sub(r'-{2,}', '-', result)
|
||||
|
||||
# 去除首尾 `-`
|
||||
result = result.strip('-')
|
||||
|
||||
return result or 'untitled'
|
||||
|
||||
|
||||
def run_reslug(target_rel_path: str = None, dry_run: bool = False):
    """Normalize the ``slug`` / ``source_path`` of manifest entries.

    For each selected entry the slug is recomputed from the entry's raw
    path with ``_compute_normalized_slug`` and ``source_path`` is set to
    ``wiki/sources/<slug>.md``. A summary and a per-entry preview are
    printed; the manifest is saved only when something changed and
    ``dry_run`` is false.

    Args:
        target_rel_path: Raw relative path of a single entry to process;
            when falsy, every manifest entry is processed.
        dry_run: When True, only print the preview and leave the manifest
            untouched.

    Returns:
        None. If ``target_rel_path`` is given but absent from the manifest,
        an error line is printed and the function returns early.
    """
    manifest = load_manifest()
    files = manifest.get("files", {})

    if target_rel_path:
        if target_rel_path not in files:
            print(red(f" ✗ Not found in manifest: {target_rel_path}"))
            return
        targets = [(target_rel_path, files[target_rel_path])]
    else:
        targets = list(files.items())

    changed = []
    skipped = 0

    # NOTE(review): slugs derive from the file stem only, so two raw files
    # with the same stem in different directories map to the same slug --
    # confirm whether that collision is acceptable for the wiki layout.
    for rel_path, info in targets:
        new_slug = _compute_normalized_slug(rel_path)
        old_slug = info.get("slug", "")
        new_source_path = f"wiki/sources/{new_slug}.md"
        old_source_path = info.get("source_path", "")

        if new_slug == old_slug and new_source_path == old_source_path:
            skipped += 1
            continue

        changed.append({
            "rel_path": rel_path,
            "old_slug": old_slug,
            "new_slug": new_slug,
            "old_source_path": old_source_path,
            "new_source_path": new_source_path,
        })

    print(f"\n{bold('=== Reslug Preview' if dry_run else '=== Reslug')}\n")
    print(f" Total entries scanned : {len(targets)}")
    print(f" Unchanged (skipped) : {skipped}")
    print(f" To update : {len(changed)}\n")

    if not changed:
        print(f" {green('✓')} All slugs already normalized.\n")
        return

    for item in changed:
        print(f" {dim(item['rel_path'])}")
        if item['old_slug'] != item['new_slug']:
            print(f" slug : {yellow(item['old_slug'])} → {green(item['new_slug'])}")
        if item['old_source_path'] != item['new_source_path']:
            print(f" src : {yellow(item['old_source_path'])} → {green(item['new_source_path'])}")
        print()

    if dry_run:
        print(f" {yellow('⚠')} Dry-run — manifest NOT updated. Re-run without --dry-run to apply.\n")
        return

    # Apply the changes to the manifest entries in place, then persist.
    for item in changed:
        entry = files[item["rel_path"]]
        entry["slug"] = item["new_slug"]
        entry["source_path"] = item["new_source_path"]

    save_manifest(manifest)
    print(f" {green('✓')} manifest.json updated ({len(changed)} entries changed).\n")
|
||||
|
||||
|
||||
# ─── 管理接口:mark_ingested(供摄取流程调用) ─────────────────────────────────────────
|
||||
|
||||
def mark_ingested(rel_path: str, slug: str, json_mode: bool = False):
    """Mark one raw file as ingested by updating its manifest entry.

    Behavior:
        - ``slug`` must be a non-blank string; otherwise an error is
          reported and the process exits with status 1.
        - ``rel_path`` must already be a key of the manifest's ``files``
          mapping; otherwise an error is reported and the process exits
          with status 1.
        - ``slug`` is stored stripped of surrounding whitespace, and
          ``source_path`` is derived as ``wiki/sources/<slug>.md``.
        - ``modified`` is refreshed from the raw file's mtime (UTC, ISO
          format). If the file does not exist the old value is kept and a
          warning is printed (suppressed in JSON mode).
        - ``ingested`` is set to True, ``ingested_at`` to ``iso_now()``,
          and any ``error`` key is removed.

    Args:
        rel_path: Path relative to the repository root, e.g.
            ``"raw/dir/name.md"``.
        slug: Wiki slug, e.g. ``"my-article"``.
        json_mode: When True, emit a single-line JSON object instead of
            human-readable text.

    Raises:
        SystemExit: With code 1 when ``slug`` is blank or ``rel_path`` is
            not present in the manifest.
    """

    def _fail(message: str) -> None:
        # Report the error in the active output format, then abort.
        if json_mode:
            print(json.dumps({"event": "error", "message": message}))
        else:
            print(red(f" ✗ {message}"))
        raise SystemExit(1)

    clean_slug = slug.strip() if slug else ""
    if not clean_slug:
        _fail("--slug is required for --mark-ingested")

    manifest = load_manifest()
    files = manifest.get("files", {})

    if rel_path not in files:
        _fail(f"rel_path not found in manifest (run --sync first): {rel_path}")

    # `entry` is the dict stored inside the manifest, so the in-place
    # updates below are what save_manifest() persists.
    entry = files[rel_path]

    entry["slug"] = clean_slug
    entry["source_path"] = f"wiki/sources/{clean_slug}.md"

    # Refresh `modified` from the raw file's actual mtime.
    abs_path = REPO_ROOT / rel_path
    if abs_path.exists():
        entry["modified"] = datetime.fromtimestamp(
            abs_path.stat().st_mtime, tz=timezone.utc
        ).isoformat()
    elif not json_mode:
        print(yellow(f" ⚠ Raw file not found, modified timestamp not updated: {rel_path}"))

    entry["ingested"] = True
    entry["ingested_at"] = iso_now()
    entry.pop("error", None)

    save_manifest(manifest)

    if json_mode:
        print(json.dumps({
            "event": "mark_ingested",
            "rel_path": rel_path,
            "slug": entry["slug"],
            "source_path": entry["source_path"],
            "modified": entry.get("modified"),
            "ingested_at": entry["ingested_at"],
        }))
    else:
        print(f" {green('✓')} Marked ingested: {rel_path}")
        print(f" slug : {entry['slug']}")
        print(f" source_path : {entry['source_path']}")
        print(f" modified : {entry.get('modified', '(unchanged)')}")
        print(f" ingested_at : {entry['ingested_at']}")
|
||||
|
||||
|
||||
# ─── CLI 入口 ───────────────────────────────────────────────
|
||||
@@ -622,22 +770,13 @@ if __name__ == "__main__":
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mark-ingested",
|
||||
metavar=("REL_PATH"),
|
||||
metavar="REL_PATH",
|
||||
nargs=1,
|
||||
help="标记单个 raw 文件为已摄取:传入相对路径(例如 'raw/dir/file.md')。",
|
||||
help="标记单个 raw 文件为已摄取:传入相对路径(例如 'raw/dir/file.md')。必须配合 --slug 使用。",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--slug",
|
||||
help="与 --mark-ingested 配合:指定生成的 wiki slug(例如 my-article)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--source-path",
|
||||
help="与 --mark-ingested 配合:指定 wiki source 路径(例如 wiki/sources/my-article.md)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-recalc-hash",
|
||||
action="store_true",
|
||||
help="与 --mark-ingested 配合:不要重新计算文件 hash/modified(默认会重新计算)",
|
||||
help="与 --mark-ingested 配合(必填):指定 wiki slug(例如 my-article)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mark-json",
|
||||
@@ -650,12 +789,29 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help="与 --pending --json 配合:限制返回条目数(默认返回全部)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--reslug",
|
||||
action="store_true",
|
||||
help="批量规范化 manifest 中的 slug/source_path(中文保留,ASCII 特殊字符转 -,大写转小写,压缩连续 -)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--reslug-target",
|
||||
metavar="REL_PATH",
|
||||
help="与 --reslug 配合:只处理指定的 raw 文件(例如 'raw/dir/file.md')",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="与 --reslug 配合:只预览变更,不写入 manifest",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.mark_ingested:
|
||||
rel = args.mark_ingested[0]
|
||||
mark_ingested(rel, slug=args.slug, source_path=args.source_path, recalc_hash=not args.no_recalc_hash, json_mode=args.mark_json)
|
||||
mark_ingested(rel, slug=args.slug, json_mode=args.mark_json)
|
||||
elif args.reslug:
|
||||
run_reslug(target_rel_path=args.reslug_target, dry_run=args.dry_run)
|
||||
elif args.rebuild:
|
||||
run_rebuild()
|
||||
elif args.pending:
|
||||
|
||||
Reference in New Issue
Block a user