Auto-sync

This commit is contained in:
2026-04-14 11:58:16 +08:00
parent abc9369d1f
commit be67293b60
20 changed files with 2246 additions and 9 deletions

454
tools/build_graph.py Normal file
View File

@@ -0,0 +1,454 @@
#!/usr/bin/env python3
"""
Build the knowledge graph from the wiki.
Usage:
python tools/build_graph.py # full rebuild
python tools/build_graph.py --no-infer # skip semantic inference (faster)
python tools/build_graph.py --open # open graph.html in browser after build
Outputs:
graph/graph.json — node/edge data (cached by SHA256)
graph/graph.html — interactive vis.js visualization
Edge types:
EXTRACTED — explicit [[wikilink]] in a page
INFERRED — Claude-detected implicit relationship
AMBIGUOUS — low-confidence inferred relationship
"""
import re
import json
import hashlib
import argparse
import webbrowser
from pathlib import Path
from datetime import date
import os
try:
import networkx as nx
from networkx.algorithms import community as nx_community
HAS_NETWORKX = True
except ImportError:
HAS_NETWORKX = False
print("Warning: networkx not installed. Community detection disabled. Run: pip install networkx")
# Filesystem layout, all derived from the directory two levels above this file.
REPO_ROOT = Path(__file__).parent.parent
WIKI_DIR = REPO_ROOT / "wiki"
GRAPH_DIR = REPO_ROOT / "graph"
GRAPH_JSON = GRAPH_DIR / "graph.json"
GRAPH_HTML = GRAPH_DIR / "graph.html"
CACHE_FILE = GRAPH_DIR / ".cache.json"
LOG_FILE = WIKI_DIR / "log.md"
SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"
# Node type → color mapping ("unknown" is the fallback for unrecognized types)
TYPE_COLORS = {
    "source": "#4CAF50",
    "entity": "#2196F3",
    "concept": "#FF9800",
    "synthesis": "#9C27B0",
    "unknown": "#9E9E9E",
}
# Edge type → color mapping; keys match the edge types listed in the module docstring
EDGE_COLORS = {
    "EXTRACTED": "#555555",
    "INFERRED": "#FF5722",
    "AMBIGUOUS": "#BDBDBD",
}
def read_file(path: Path) -> str:
    """Return the UTF-8 text of *path*, or "" when the path does not exist."""
    if not path.exists():
        return ""
    return path.read_text(encoding="utf-8")
def call_llm(prompt: str, model_env: str, default_model: str, max_tokens: int = 4096) -> str:
    """Send *prompt* as a single user message through litellm and return the reply text.

    The model name is read from the environment variable named by
    *model_env*, falling back to *default_model*. If litellm cannot be
    imported, an error is printed and the process exits with status 1.
    """
    try:
        from litellm import completion
    except ImportError:
        import sys

        print("Error: litellm not installed. Run: pip install litellm")
        sys.exit(1)

    chosen_model = os.getenv(model_env, default_model)
    user_message = {"role": "user", "content": prompt}
    reply = completion(
        model=chosen_model,
        messages=[user_message],
        max_tokens=max_tokens
    )
    return reply.choices[0].message.content
def sha256(text: str) -> str:
    """Return the full hexadecimal SHA-256 digest of *text* (encoded with str.encode())."""
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    return hasher.hexdigest()
def all_wiki_pages() -> list[Path]:
    """Return every ``*.md`` file under WIKI_DIR except index, log and lint-report files."""
    skipped_names = ("index.md", "log.md", "lint-report.md")
    found = []
    for candidate in WIKI_DIR.rglob("*.md"):
        if candidate.name in skipped_names:
            continue
        found.append(candidate)
    return found
def extract_wikilinks(content: str) -> list[str]:
    """Return the distinct ``[[wikilink]]`` targets in *content*, in order of first appearance.

    Duplicates are removed with ``dict.fromkeys`` rather than ``set`` so the
    result order is deterministic; a set's iteration order for strings varies
    between interpreter runs, which would reorder edges in the generated graph.
    """
    targets = re.findall(r'\[\[([^\]]+)\]\]', content)
    return list(dict.fromkeys(targets))
def extract_frontmatter_type(content: str) -> str:
    """Return the value of the first ``type:`` line in *content*, or "unknown".

    Surrounding single or double quotes are stripped from the value.
    """
    found = re.search(r'^type:\s*(\S+)', content, re.MULTILINE)
    if found is None:
        return "unknown"
    return found.group(1).strip('"\'')
def page_id(path: Path) -> str:
    """Return the node id for *path*: its POSIX path relative to WIKI_DIR without the suffix.

    Only the final file extension is removed. A plain string replace of
    ".md" would also delete that text from directory names or from the
    middle of a file name (e.g. ``notes.md.backup.md``).

    Raises:
        ValueError: if *path* is not located under WIKI_DIR.
    """
    return path.relative_to(WIKI_DIR).with_suffix("").as_posix()
def load_cache() -> dict:
    """Load the inference cache from CACHE_FILE.

    Returns an empty dict when the file is missing, unreadable, not valid
    JSON, or does not contain a JSON object. Callers use ``cache.get(...)``,
    so a list or scalar would otherwise crash them.
    """
    if not CACHE_FILE.exists():
        return {}
    try:
        loaded = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and undecodable bytes.
        return {}
    return loaded if isinstance(loaded, dict) else {}
def save_cache(cache: dict):
    """Write *cache* to CACHE_FILE as indented JSON, creating GRAPH_DIR first if needed."""
    GRAPH_DIR.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(cache, indent=2)
    CACHE_FILE.write_text(serialized)
def build_nodes(pages: list[Path]) -> list[dict]:
    """Create one node record per wiki page.

    The label is taken from the first ``title:`` line when present, otherwise
    from the file stem. The color is looked up in TYPE_COLORS by the page's
    ``type:`` value, falling back to the "unknown" color.
    """
    title_pattern = re.compile(r'^title:\s*"?([^"\n]+)"?', re.MULTILINE)

    def describe(page: Path) -> dict:
        text = read_file(page)
        kind = extract_frontmatter_type(text)
        found = title_pattern.search(text)
        return {
            "id": page_id(page),
            "label": found.group(1).strip() if found else page.stem,
            "type": kind,
            "color": TYPE_COLORS.get(kind, TYPE_COLORS["unknown"]),
            "path": str(page.relative_to(REPO_ROOT)),
        }

    return [describe(page) for page in pages]
def build_extracted_edges(pages: list[Path]) -> list[dict]:
    """Pass 1: deterministic wikilink edges.

    A link resolves when its lowercased text equals the lowercased stem of a
    page. Self-links are ignored and each (source, target) pair is emitted
    at most once.
    """
    # Lowercased stem -> page id; a later page with the same stem overwrites an earlier one.
    ids_by_stem = {}
    for page in pages:
        ids_by_stem[page.stem.lower()] = page_id(page)

    collected = []
    emitted_pairs = set()
    for page in pages:
        text = read_file(page)
        origin = page_id(page)
        for name in extract_wikilinks(text):
            destination = ids_by_stem.get(name.lower())
            if not destination or destination == origin:
                continue
            pair = (origin, destination)
            if pair in emitted_pairs:
                continue
            emitted_pairs.add(pair)
            collected.append({
                "from": origin,
                "to": destination,
                "type": "EXTRACTED",
                "color": EDGE_COLORS["EXTRACTED"],
                "confidence": 1.0,
            })
    return collected
def _inferred_edge(src: str, rel: dict) -> dict:
    """Convert one relationship record (from the LLM or the cache) into an edge dict."""
    edge_type = rel.get("type", "INFERRED")
    return {
        "from": src,
        "to": rel["to"],
        "type": edge_type,
        "title": rel.get("relationship", ""),
        "label": "",
        "color": EDGE_COLORS.get(edge_type, EDGE_COLORS["INFERRED"]),
        "confidence": float(rel.get("confidence", 0.7)),
    }


def build_inferred_edges(pages: list[Path], existing_edges: list[dict], cache: dict) -> list[dict]:
    """Pass 2: API-inferred semantic relationships.

    Pages whose full-content SHA-256 matches their cache entry reuse the
    cached relationships; every other page is sent to the LLM.

    Args:
        pages: wiki pages to consider.
        existing_edges: edges produced by pass 1 (each with "from"/"to" keys).
        cache: mutable mapping ``str(path) -> {"hash", "edges"}``; updated in
            place for every page whose LLM reply was parsed successfully.

    Returns:
        Inferred edges for all pages: cached ones for unchanged pages plus
        freshly inferred ones for changed pages.
    """
    new_edges = []
    changed_pages = []
    content_hashes = {}
    for p in pages:
        h = sha256(read_file(p))
        entry = cache.get(str(p))
        if not isinstance(entry, dict) or entry.get("hash") != h:
            changed_pages.append(p)
            content_hashes[p] = h
            continue
        # Page unchanged: replay its inferred edges from the cache.
        src = page_id(p)
        cached_rels = entry.get("edges", [])
        for rel in cached_rels if isinstance(cached_rels, list) else []:
            if not isinstance(rel, dict):
                continue
            try:
                new_edges.append(_inferred_edge(src, rel))
            except (KeyError, TypeError, ValueError):
                # Corrupt cache record: drop it rather than abort the build.
                continue

    if not changed_pages:
        print(" no changed pages — skipping semantic inference")
        # Cached edges must still be returned, otherwise they vanish from the graph.
        return new_edges

    print(f" inferring relationships for {len(changed_pages)} changed pages...")
    known_ids = {page_id(p) for p in pages}
    extracted_pairs = {(e["from"], e["to"]) for e in existing_edges}
    # Build a summary of existing nodes for context
    node_list = "\n".join(f"- {page_id(p)} ({extract_frontmatter_type(read_file(p))})" for p in pages)

    for p in changed_pages:
        content = read_file(p)[:2000]  # truncate for context efficiency
        src = page_id(p)
        # The prompt labels this list as edges "from this page", so restrict it to src.
        own_edges = [e for e in existing_edges if e["from"] == src][:30]
        existing_edge_summary = "\n".join(
            f"- {e['from']} -> {e['to']} (EXTRACTED)" for e in own_edges
        ) or "(none)"
        prompt = f"""Analyze this wiki page and identify implicit semantic relationships to other pages in the wiki.
Source page: {src}
Content:
{content}
All available pages:
{node_list}
Already-extracted edges from this page:
{existing_edge_summary}
Return ONLY a JSON array of NEW relationships not already captured by explicit wikilinks:
[
{{"to": "page-id", "relationship": "one-line description", "confidence": 0.0-1.0, "type": "INFERRED or AMBIGUOUS"}}
]
Rules:
- Only include pages from the available list above
- Confidence >= 0.7 → INFERRED, < 0.7 → AMBIGUOUS
- Do not repeat edges already in the extracted list
- Return empty array [] if no new relationships found
"""
        raw = call_llm(prompt, "LLM_MODEL_FAST", "claude-3-5-haiku-latest", max_tokens=1024)
        raw = (raw or "").strip()
        raw = re.sub(r"^```(?:json)?\s*", "", raw)
        raw = re.sub(r"\s*```$", "", raw)
        try:
            inferred = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            inferred = None
        if not isinstance(inferred, list):
            # Leave the cache entry untouched so the page is retried on the next run.
            print(f" warning: could not parse inferred relationships for {src}; skipping")
            continue

        valid_rels = []
        for rel in inferred:
            if not isinstance(rel, dict):
                continue
            target = rel.get("to")
            if not isinstance(target, str) or target not in known_ids:
                continue  # target is not a real page
            if target == src or (src, target) in extracted_pairs:
                continue  # self-link or already covered by an explicit wikilink
            try:
                edge = _inferred_edge(src, rel)
            except (TypeError, ValueError):
                continue  # e.g. non-numeric confidence
            new_edges.append(edge)
            valid_rels.append(rel)

        # Store the hash of the FULL content: that is what the change check
        # above compares against. Hashing the truncated text would make every
        # page longer than 2000 characters look changed on each run.
        cache[str(p)] = {
            "hash": content_hashes[p],
            "edges": valid_rels
        }
    return new_edges
def detect_communities(nodes: list[dict], edges: list[dict]) -> dict[str, int]:
    """Assign community IDs to nodes using Louvain algorithm.

    Returns a mapping of node id to community index. The mapping is empty
    when networkx is unavailable, when the graph has no edges, or when the
    community detection call raises.
    """
    if not HAS_NETWORKX:
        return {}

    graph = nx.Graph()
    graph.add_nodes_from(node["id"] for node in nodes)
    graph.add_edges_from((edge["from"], edge["to"]) for edge in edges)
    if graph.number_of_edges() == 0:
        return {}

    try:
        groups = nx_community.louvain_communities(graph, seed=42)
        return {
            member: index
            for index, group in enumerate(groups)
            for member in group
        }
    except Exception:
        # Best effort: the graph is still usable without community coloring.
        return {}
# Palette for community coloring; build_graph indexes it with
# ``comm_id % len(COMMUNITY_COLORS)``, so colors repeat beyond ten communities.
COMMUNITY_COLORS = [
    "#E91E63", "#00BCD4", "#8BC34A", "#FF5722", "#673AB7",
    "#FFC107", "#009688", "#F44336", "#3F51B5", "#CDDC39",
]
def _script_safe_json(data) -> str:
    """Serialize *data* to JSON that can be embedded inside a ``<script>`` element.

    Page titles come from wiki content, so a label containing ``</script>``
    would terminate the script block early. ``<\\/`` is a valid JSON/JS
    escape for ``</`` and decodes to the same string.
    """
    return json.dumps(data, indent=2).replace("</", "<\\/")


def render_html(nodes: list[dict], edges: list[dict]) -> str:
    """Generate self-contained vis.js HTML.

    Args:
        nodes: node records as produced by build_nodes.
        edges: edge records with at least "from" and "to" keys.

    Returns:
        A complete HTML document with the node and edge data inlined; the
        vis-network library itself is loaded from unpkg.com.
    """
    nodes_json = _script_safe_json(nodes)
    edges_json = _script_safe_json(edges)
    legend_items = "".join(
        f'<span style="background:{color};padding:3px 8px;margin:2px;border-radius:3px;font-size:12px">{t}</span>'
        for t, color in TYPE_COLORS.items() if t != "unknown"
    )
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>LLM Wiki — Knowledge Graph</title>
<script src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
<style>
body {{ margin: 0; background: #1a1a2e; font-family: sans-serif; color: #eee; }}
#graph {{ width: 100vw; height: 100vh; }}
#controls {{
position: fixed; top: 10px; left: 10px; background: rgba(0,0,0,0.7);
padding: 12px; border-radius: 8px; z-index: 10; max-width: 260px;
}}
#controls h3 {{ margin: 0 0 8px; font-size: 14px; }}
#search {{ width: 100%; padding: 4px; margin-bottom: 8px; background: #333; color: #eee; border: 1px solid #555; border-radius: 4px; }}
#info {{
position: fixed; bottom: 10px; left: 10px; background: rgba(0,0,0,0.8);
padding: 12px; border-radius: 8px; z-index: 10; max-width: 320px;
display: none;
}}
#stats {{ position: fixed; top: 10px; right: 10px; background: rgba(0,0,0,0.7); padding: 10px; border-radius: 8px; font-size: 12px; }}
</style>
</head>
<body>
<div id="controls">
<h3>LLM Wiki Graph</h3>
<input id="search" type="text" placeholder="Search nodes..." oninput="searchNodes(this.value)">
<div>{legend_items}</div>
<div style="margin-top:8px;font-size:11px;color:#aaa">
<span style="background:{EDGE_COLORS["EXTRACTED"]};padding:2px 6px;border-radius:3px;margin-right:4px">──</span> Explicit link<br>
<span style="background:{EDGE_COLORS["INFERRED"]};padding:2px 6px;border-radius:3px;margin-right:4px">──</span> Inferred<br>
<span style="background:{EDGE_COLORS["AMBIGUOUS"]};padding:2px 6px;border-radius:3px;margin-right:4px">──</span> Ambiguous
</div>
</div>
<div id="graph"></div>
<div id="info">
<b id="info-title"></b><br>
<span id="info-type" style="font-size:12px;color:#aaa"></span><br>
<span id="info-path" style="font-size:11px;color:#666"></span>
</div>
<div id="stats"></div>
<script>
const nodes = new vis.DataSet({nodes_json});
const edges = new vis.DataSet({edges_json});
const container = document.getElementById("graph");
const network = new vis.Network(container, {{ nodes, edges }}, {{
nodes: {{
shape: "dot",
size: 12,
font: {{ color: "#eee", size: 13 }},
borderWidth: 2,
}},
edges: {{
width: 1.2,
smooth: {{ type: "continuous" }},
arrows: {{ to: {{ enabled: true, scaleFactor: 0.5 }} }},
}},
physics: {{
stabilization: {{ iterations: 150 }},
barnesHut: {{ gravitationalConstant: -8000, springLength: 120 }},
}},
interaction: {{ hover: true, tooltipDelay: 200 }},
}});
network.on("click", params => {{
if (params.nodes.length > 0) {{
const node = nodes.get(params.nodes[0]);
document.getElementById("info").style.display = "block";
document.getElementById("info-title").textContent = node.label;
document.getElementById("info-type").textContent = node.type;
document.getElementById("info-path").textContent = node.path;
}} else {{
document.getElementById("info").style.display = "none";
}}
}});
document.getElementById("stats").textContent =
`${{nodes.length}} nodes · ${{edges.length}} edges`;
function searchNodes(q) {{
const lower = q.toLowerCase();
nodes.forEach(n => {{
nodes.update({{ id: n.id, opacity: (!q || n.label.toLowerCase().includes(lower)) ? 1 : 0.15 }});
}});
}}
</script>
</body>
</html>"""
def append_log(entry: str):
    """Prepend *entry* to wiki/log.md so the newest record comes first."""
    target = WIKI_DIR / "log.md"
    previous = read_file(target)
    combined = entry.strip() + "\n\n" + previous
    target.write_text(combined, encoding="utf-8")
def build_graph(infer: bool = True, open_browser: bool = False):
    """Build graph.json and graph.html from the wiki and record the run in the log.

    Args:
        infer: when true, run the LLM inference pass and persist its cache.
        open_browser: when true, open the generated HTML in the default browser.
    """
    pages = all_wiki_pages()
    today = date.today().isoformat()
    if not pages:
        print("Wiki is empty. Ingest some sources first.")
        return
    print(f"Building graph from {len(pages)} wiki pages...")
    GRAPH_DIR.mkdir(parents=True, exist_ok=True)
    cache = load_cache()

    # Pass 1: extracted edges
    print(" Pass 1: extracting wikilinks...")
    nodes = build_nodes(pages)
    edges = build_extracted_edges(pages)
    print(f"{len(edges)} extracted edges")

    # Pass 2: inferred edges
    if infer:
        print(" Pass 2: inferring semantic relationships...")
        inferred = build_inferred_edges(pages, edges, cache)
        edges.extend(inferred)
        print(f"{len(inferred)} inferred edges")
        save_cache(cache)

    # Community detection overrides the type-based color when a community was found.
    print(" Running Louvain community detection...")
    communities = detect_communities(nodes, edges)
    for node in nodes:
        comm_id = communities.get(node["id"], -1)
        if comm_id >= 0:
            node["color"] = COMMUNITY_COLORS[comm_id % len(COMMUNITY_COLORS)]
            node["group"] = comm_id

    # Save graph.json
    graph_data = {"nodes": nodes, "edges": edges, "built": today}
    GRAPH_JSON.write_text(json.dumps(graph_data, indent=2), encoding="utf-8")
    print(f" saved: graph/graph.json ({len(nodes)} nodes, {len(edges)} edges)")

    # Save graph.html. The document declares charset UTF-8 and contains
    # non-ASCII characters, so the platform default encoding must not be used.
    html = render_html(nodes, edges)
    GRAPH_HTML.write_text(html, encoding="utf-8")
    print(" saved: graph/graph.html")

    counts = {"EXTRACTED": 0, "INFERRED": 0, "AMBIGUOUS": 0}
    for e in edges:
        counts[e["type"]] = counts.get(e["type"], 0) + 1
    append_log(
        f"## [{today}] graph | Knowledge graph rebuilt\n\n"
        f"{len(nodes)} nodes, {len(edges)} edges "
        f"({counts['EXTRACTED']} extracted, {counts['INFERRED']} inferred, "
        f"{counts['AMBIGUOUS']} ambiguous)."
    )
    if open_browser:
        # as_uri() yields a correctly escaped file:// URL on every platform.
        webbrowser.open(GRAPH_HTML.resolve().as_uri())
if __name__ == "__main__":
    # Command-line entry point: parse the two flags and run the build.
    cli = argparse.ArgumentParser(description="Build LLM Wiki knowledge graph")
    cli.add_argument("--no-infer", action="store_true", help="Skip semantic inference (faster)")
    cli.add_argument("--open", action="store_true", help="Open graph.html in browser")
    options = cli.parse_args()
    run_inference = not options.no_infer
    build_graph(infer=run_inference, open_browser=options.open)

100
tools/heal.py Executable file
View File

@@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""
Graph Self-Healing Tool
Automatically retrieves "Missing Entity Pages" from the wiki and generates
comprehensive definition pages for them using the LLM.
It resolves broken entity links by scanning existing contexts where the entity is referenced.
Usage:
python tools/heal.py
"""
import os
import sys
from pathlib import Path
try:
from litellm import completion
except ImportError:
print("Error: litellm not installed. Run: pip install litellm")
sys.exit(1)
# Ensure tools can be imported
sys.path.insert(0, str(Path(__file__).parent.parent))
from tools.lint import find_missing_entities, all_wiki_pages
# Paths are derived from the directory two levels above this file.
REPO_ROOT = Path(__file__).parent.parent
WIKI_DIR = REPO_ROOT / "wiki"
# Generated entity pages are written here.
ENTITIES_DIR = WIKI_DIR / "entities"
def call_llm(prompt: str, max_tokens: int = 1500) -> str:
    """Send *prompt* as a single user message via litellm and return the reply text.

    The model comes from the LLM_MODEL environment variable and defaults to
    "claude-3-5-haiku-latest". Provider credentials are left to litellm's
    standard environment variables (e.g. GEMINI_API_KEY, ANTHROPIC_API_KEY,
    OPENAI_API_KEY).
    """
    chosen_model = os.getenv("LLM_MODEL", "claude-3-5-haiku-latest")  # default to fast model
    conversation = [{"role": "user", "content": prompt}]
    reply = completion(
        model=chosen_model,
        messages=conversation,
        max_tokens=max_tokens
    )
    return reply.choices[0].message.content
def search_sources(entity: str, pages: list[Path]) -> list[Path]:
    """Find up to 15 pages, outside entities/ and concepts/, that mention *entity*.

    Matching is a case-insensitive substring search. The directory filter
    looks at path components below WIKI_DIR instead of doing a substring
    test on the absolute path; a repository checked out under e.g.
    ``~/concepts-wiki/`` would otherwise exclude every page. Scanning stops
    as soon as the limit is reached, and unreadable pages are skipped.
    """
    limit = 15
    needle = entity.lower()
    skipped_dirs = {"entities", "concepts"}
    wiki_root = WIKI_DIR.resolve()

    sources = []
    for p in pages:
        try:
            folder_parts = p.resolve().parent.relative_to(wiki_root).parts
        except ValueError:
            # Page lives outside the wiki tree; fall back to its own components.
            folder_parts = p.parent.parts
        if skipped_dirs.intersection(folder_parts):
            continue
        try:
            content = p.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        if needle in content.lower():
            sources.append(p)
            if len(sources) >= limit:
                break
    return sources
def heal_missing_entities():
    """Generate an entity page for every missing entity reported by the linter.

    For each entity, excerpts (first 800 characters) of the pages found by
    search_sources are passed to the LLM and its reply is written to
    ``wiki/entities/<entity>.md``. A failure for one entity is reported and
    does not stop the remaining ones.
    """
    pages = all_wiki_pages()
    missing_entities = find_missing_entities(pages)
    if not missing_entities:
        print("Graph is fully connected. No missing entities found!")
        return
    ENTITIES_DIR.mkdir(exist_ok=True, parents=True)
    entities_root = ENTITIES_DIR.resolve()
    print(f"Found {len(missing_entities)} missing entity nodes. Commencing auto-heal...")
    for entity in missing_entities:
        print(f"Healing entity page for: {entity}")
        # The entity name comes from wikilink text, so it may contain path
        # separators or "..". Refuse anything that would land outside
        # wiki/entities, and do so before paying for an LLM call.
        out_path = ENTITIES_DIR / f"{entity}.md"
        if not out_path.resolve().is_relative_to(entities_root):
            print(f" [!] Skipping {entity}: name does not map to a file inside wiki/entities")
            continue

        sources = search_sources(entity, pages)
        context = "".join(
            f"\n\n### {s.name}\n{s.read_text(encoding='utf-8')[:800]}" for s in sources
        )
        prompt = f"""You are filling a data gap in the Personal LLM Wiki.
Create an Entity definition page for "{entity}".
Here is how the entity appears in the current sources:
{context}
Format:
---
title: "{entity}"
type: entity
tags: []
sources: {[s.name for s in sources]}
---
# {entity}
Write a comprehensive paragraph defining what `{entity}` means in the context of this wiki, its main significance, and any actions or associations related to it.
"""
        try:
            result = call_llm(prompt)
            if not result or not result.strip():
                raise ValueError("empty response from model")
            # A name such as "group/Item" needs its sub-directory to exist.
            out_path.parent.mkdir(parents=True, exist_ok=True)
            out_path.write_text(result, encoding="utf-8")
            print(f" -> Saved to {out_path.relative_to(REPO_ROOT)}")
        except Exception as e:
            # Per-entity boundary: report and continue with the next entity.
            print(f" [!] Failed to generate {entity}: {e}")
# Script entry point: run the auto-heal pass only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    heal_missing_entities()

239
tools/ingest.py Normal file
View File

@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
Ingest a source document into the LLM Wiki.
Usage:
python tools/ingest.py <path-to-source> [path2 ...] [dir1 ...]
python tools/ingest.py raw/articles/my-article.md
The LLM reads the source, extracts knowledge, and updates the wiki:
- Creates wiki/sources/<slug>.md
- Updates wiki/index.md
- Updates wiki/overview.md (if warranted)
- Creates/updates entity and concept pages
- Appends to wiki/log.md
- Flags contradictions
"""
import os
import sys
import json
import hashlib
import re
from pathlib import Path
from datetime import date
import os
# Paths are derived from the directory two levels above this file.
REPO_ROOT = Path(__file__).parent.parent
WIKI_DIR = REPO_ROOT / "wiki"
LOG_FILE = WIKI_DIR / "log.md"
INDEX_FILE = WIKI_DIR / "index.md"
OVERVIEW_FILE = WIKI_DIR / "overview.md"
# Contents of this file are passed to the LLM as "Schema and conventions".
SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"
def sha256(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *text*."""
    full_digest = hashlib.sha256(text.encode()).hexdigest()
    short_length = 16
    return full_digest[:short_length]
def read_file(path: Path) -> str:
    """Read *path* as UTF-8 text; a missing path yields an empty string."""
    if path.exists():
        return path.read_text(encoding="utf-8")
    return ""
def call_llm(prompt: str, max_tokens: int = 8192) -> str:
    """Send *prompt* as a single user message via litellm and return the reply text.

    The model comes from the LLM_MODEL environment variable and defaults to
    "claude-3-5-sonnet-latest". If litellm cannot be imported, an error is
    printed and the process exits with status 1.
    """
    try:
        from litellm import completion
    except ImportError:
        print("Error: litellm not installed. Run: pip install litellm")
        sys.exit(1)

    chosen_model = os.getenv("LLM_MODEL", "claude-3-5-sonnet-latest")
    conversation = [{"role": "user", "content": prompt}]
    reply = completion(
        model=chosen_model,
        messages=conversation,
        max_tokens=max_tokens
    )
    return reply.choices[0].message.content
def write_file(path: Path, content: str):
    """Write *content* to *path* as UTF-8, creating parent directories, and report it.

    The printed path is relative to REPO_ROOT, so *path* must be inside the
    repository (``relative_to`` raises ValueError otherwise).
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
    shown = path.relative_to(REPO_ROOT)
    print(f" wrote: {shown}")
def build_wiki_context() -> str:
    """Assemble the current wiki state that is passed to the LLM.

    Includes the index, the overview, and the five most recently modified
    pages in wiki/sources (for contradiction checking). Sections are joined
    with a markdown horizontal rule. Returns "" when none of them exist.
    """
    parts = []
    if INDEX_FILE.exists():
        parts.append(f"## wiki/index.md\n{read_file(INDEX_FILE)}")
    if OVERVIEW_FILE.exists():
        parts.append(f"## wiki/overview.md\n{read_file(OVERVIEW_FILE)}")
    # Include a few recent source pages for contradiction checking
    sources_dir = WIKI_DIR / "sources"
    if sources_dir.exists():
        recent = sorted(sources_dir.glob("*.md"), key=lambda p: p.stat().st_mtime, reverse=True)[:5]
        for p in recent:
            # read_file decodes as UTF-8, matching how pages are written;
            # a bare read_text() would use the platform default encoding.
            parts.append(f"## {p.relative_to(REPO_ROOT)}\n{read_file(p)}")
    return "\n\n---\n\n".join(parts)
def parse_json_from_response(text: str) -> dict:
    """Extract and decode the JSON object contained in an LLM reply.

    A leading/trailing markdown code fence is removed first; the span from
    the first "{" to the last "}" is then parsed.

    Raises:
        ValueError: if the text contains no "{...}" span
            (json.JSONDecodeError, a ValueError subclass, if it is not valid JSON).
    """
    cleaned = text.strip()
    # Strip markdown code fences if present
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
    cleaned = re.sub(r"\s*```$", "", cleaned.strip())
    # Greedy match: from the first opening brace to the last closing brace
    candidate = re.search(r"\{[\s\S]*\}", cleaned)
    if candidate is None:
        raise ValueError("No JSON object found in response")
    return json.loads(candidate.group())
def update_index(new_entry: str, section: str = "Sources"):
    """Insert *new_entry* directly below the ``## <section>`` header of the index.

    A default index skeleton is used when the index file is missing or
    empty. The header is matched as a whole line, so the entry is still
    inserted when the header is the last line without a trailing newline,
    and ``### Sources`` or a header repeated later in the file is not
    mistaken for the target. If the section does not exist it is appended
    at the end.
    """
    content = read_file(INDEX_FILE)
    if not content:
        content = "# Wiki Index\n\n## Overview\n- [Overview](overview.md) — living synthesis\n\n## Sources\n\n## Entities\n\n## Concepts\n\n## Syntheses\n"
    section_header = f"## {section}"

    lines = content.splitlines()
    for position, line in enumerate(lines):
        if line.strip() == section_header:
            lines.insert(position + 1, new_entry)
            break
    else:
        # Section not present: add it at the end, separated by a blank line.
        if lines and lines[-1].strip():
            lines.append("")
        lines.extend([section_header, new_entry])
    write_file(INDEX_FILE, "\n".join(lines) + "\n")
def append_log(entry: str):
    """Prepend *entry* to the wiki log so the newest record comes first."""
    previous = read_file(LOG_FILE)
    combined = entry.strip() + "\n\n" + previous
    write_file(LOG_FILE, combined)
def _wiki_target(relative):
    """Map a wiki-relative path supplied by the LLM to a file path under WIKI_DIR.

    Returns None when *relative* is not a non-empty string or when it would
    resolve outside the wiki directory (absolute paths, ".." segments).
    """
    if not isinstance(relative, str) or not relative.strip():
        return None
    wiki_root = WIKI_DIR.resolve()
    resolved = (wiki_root / relative).resolve()
    if resolved == wiki_root or not resolved.is_relative_to(wiki_root):
        return None
    return WIKI_DIR / relative


def ingest(source_path: str):
    """Ingest one source document into the wiki.

    Sends the source, the schema and the current wiki context to the LLM,
    then writes the returned source page, entity/concept pages and optional
    overview, and updates the index and the log.

    The process exits with status 1 when the source file is missing, when
    the reply cannot be parsed, or when it lacks required fields. Required
    fields are validated before anything is written, so a bad reply cannot
    leave the wiki half-updated. Paths supplied by the model are confined to
    the wiki directory.
    """
    import tempfile

    source = Path(source_path)
    if not source.exists():
        print(f"Error: file not found: {source_path}")
        sys.exit(1)
    source_content = source.read_text(encoding="utf-8")
    source_hash = sha256(source_content)
    today = date.today().isoformat()
    print(f"\nIngesting: {source.name} (hash: {source_hash})")
    wiki_context = build_wiki_context()
    schema = read_file(SCHEMA_FILE)

    # Resolve both sides so a relative CLI argument such as
    # "raw/articles/x.md" is still reported relative to the repository.
    repo_root = REPO_ROOT.resolve()
    resolved_source = source.resolve()
    if resolved_source.is_relative_to(repo_root):
        source_label = resolved_source.relative_to(repo_root)
    else:
        source_label = source.name

    prompt = f"""You are maintaining an LLM Wiki. Process this source document and integrate its knowledge into the wiki.
Schema and conventions:
{schema}
Current wiki state (index + recent pages):
{wiki_context if wiki_context else "(wiki is empty — this is the first source)"}
New source to ingest (file: {source_label}):
=== SOURCE START ===
{source_content}
=== SOURCE END ===
Today's date: {today}
Return ONLY a valid JSON object with these fields (no markdown fences, no prose outside the JSON):
{{
"title": "Human-readable title for this source",
"slug": "kebab-case-slug-for-filename",
"source_page": "full markdown content for wiki/sources/<slug>.md — use the source page format from the schema",
"index_entry": "- [Title](sources/slug.md) — one-line summary",
"overview_update": "full updated content for wiki/overview.md, or null if no update needed",
"entity_pages": [
{{"path": "entities/EntityName.md", "content": "full markdown content"}}
],
"concept_pages": [
{{"path": "concepts/ConceptName.md", "content": "full markdown content"}}
],
"contradictions": ["describe any contradiction with existing wiki content, or empty list"],
"log_entry": "## [{today}] ingest | <title>\\n\\nAdded source. Key claims: ..."
}}
"""
    print(" calling API (model: ...)")
    raw = call_llm(prompt, max_tokens=8192) or ""
    try:
        data = parse_json_from_response(raw)
    except (ValueError, json.JSONDecodeError) as e:
        # Keep the raw reply for debugging in the platform's temp directory.
        debug_path = Path(tempfile.gettempdir()) / "ingest_debug.txt"
        print(f"Error parsing API response: {e}")
        print(f"Raw response saved to {debug_path}")
        debug_path.write_text(raw, encoding="utf-8")
        sys.exit(1)

    # Validate everything that is required before touching the wiki.
    required = ("slug", "source_page", "index_entry", "log_entry")
    missing = [key for key in required
               if not isinstance(data.get(key), str) or not data[key].strip()]
    if missing:
        print(f"Error: API response is missing required fields: {', '.join(missing)}")
        sys.exit(1)
    source_page_path = _wiki_target(f"sources/{data['slug']}.md")
    if source_page_path is None:
        print(f"Error: unsafe slug in API response: {data['slug']!r}")
        sys.exit(1)

    # Write source page
    write_file(source_page_path, data["source_page"])

    # Write entity and concept pages
    for group in ("entity_pages", "concept_pages"):
        for page in data.get(group) or []:
            rel_path = page.get("path") if isinstance(page, dict) else None
            content = page.get("content") if isinstance(page, dict) else None
            target = _wiki_target(rel_path)
            if target is None or not isinstance(content, str):
                print(f" skipped invalid {group} entry: {rel_path!r}")
                continue
            write_file(target, content)

    # Update overview
    if data.get("overview_update"):
        write_file(OVERVIEW_FILE, data["overview_update"])
    # Update index
    update_index(data["index_entry"], section="Sources")
    # Append log
    append_log(data["log_entry"])

    # Report contradictions
    contradictions = data.get("contradictions", [])
    if contradictions:
        print("\n ⚠️ Contradictions detected:")
        for c in contradictions:
            print(f" - {c}")
    print(f"\nDone. Ingested: {data.get('title') or data['slug']}")
if __name__ == "__main__":
    import glob

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python tools/ingest.py <path-to-source> [path2 ...] [dir1 ...]")
        sys.exit(1)

    # Each argument may be a markdown file, a directory (searched
    # recursively), or a glob pattern; only ".md" files are kept.
    candidates = []
    for raw_arg in cli_args:
        target = Path(raw_arg)
        if target.is_file() and target.suffix == ".md":
            candidates.append(target)
        elif target.is_dir():
            candidates.extend(found for found in target.rglob("*.md") if found.is_file())
        else:
            for match in glob.glob(raw_arg, recursive=True):
                matched_path = Path(match)
                if matched_path.is_file() and matched_path.suffix == ".md":
                    candidates.append(matched_path)

    # Deduplicate by resolved path while preserving first-seen order
    first_by_location = {}
    for candidate in candidates:
        first_by_location.setdefault(candidate.resolve(), candidate)
    unique_paths = list(first_by_location.values())

    if not unique_paths:
        print("Error: no markdown files found to ingest.")
        sys.exit(1)
    if len(unique_paths) > 1:
        print(f"Batch mode: found {len(unique_paths)} files to ingest.")
    for chosen in unique_paths:
        ingest(str(chosen))

210
tools/lint.py Normal file
View File

@@ -0,0 +1,210 @@
#!/usr/bin/env python3
"""
Lint the LLM Wiki for health issues.
Usage:
python tools/lint.py
python tools/lint.py --save # save lint report to wiki/lint-report.md
Checks:
- Orphan pages (no inbound wikilinks from other pages)
- Broken wikilinks (pointing to pages that don't exist)
- Missing entity pages (entities mentioned in 3+ pages but no page)
- Contradictions between pages
- Data gaps and suggested new sources
"""
import re
import sys
import argparse
from pathlib import Path
from collections import defaultdict
from datetime import date
import os
# Paths are derived from the directory two levels above this file.
REPO_ROOT = Path(__file__).parent.parent
WIKI_DIR = REPO_ROOT / "wiki"
LOG_FILE = WIKI_DIR / "log.md"
SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"
def read_file(path: Path) -> str:
    """Return the UTF-8 contents of *path*, or "" if nothing exists at that path."""
    try_read = path.exists()
    if not try_read:
        return ""
    return path.read_text(encoding="utf-8")
def call_llm(prompt: str, model_env: str, default_model: str, max_tokens: int = 4096) -> str:
    """Send *prompt* as a single user message via litellm and return the reply text.

    The model name is read from the environment variable named by
    *model_env*, falling back to *default_model*. If litellm cannot be
    imported, an error is printed and the process exits with status 1.
    """
    try:
        from litellm import completion
    except ImportError:
        print("Error: litellm not installed. Run: pip install litellm")
        sys.exit(1)

    chosen_model = os.getenv(model_env, default_model)
    conversation = [{"role": "user", "content": prompt}]
    reply = completion(
        model=chosen_model,
        messages=conversation,
        max_tokens=max_tokens
    )
    return reply.choices[0].message.content
def all_wiki_pages() -> list[Path]:
    """Return every ``*.md`` file under WIKI_DIR except index, log and lint-report files."""
    housekeeping = ("index.md", "log.md", "lint-report.md")
    collected = []
    for candidate in WIKI_DIR.rglob("*.md"):
        if candidate.name not in housekeeping:
            collected.append(candidate)
    return collected
def extract_wikilinks(content: str) -> list[str]:
    """Return the text of every ``[[wikilink]]`` in *content*, in order, duplicates included."""
    link_pattern = re.compile(r'\[\[([^\]]+)\]\]')
    return [hit.group(1) for hit in link_pattern.finditer(content)]
def page_name_to_path(name: str) -> list[Path]:
    """Try to resolve a [[WikiLink]] to a file path.

    Returns every wiki page whose stem equals *name* ignoring case. An
    exact-case match is always also a case-insensitive match, so a single
    comparison covers both.
    """
    wanted = name.lower()
    return [page for page in all_wiki_pages() if page.stem.lower() == wanted]
def find_orphans(pages: list[Path]) -> list[Path]:
    """Return the pages in *pages* that no other page links to.

    wiki/overview.md is never reported. Links are resolved against *pages*
    itself through a stem lookup built once, instead of re-walking the wiki
    directory for every link. Only members of *pages* can be returned, so
    this finds the same targets. A page's links to itself do not count as
    inbound, in line with "no inbound wikilinks from other pages".
    """
    pages_by_stem = defaultdict(list)
    for p in pages:
        pages_by_stem[p.stem.lower()].append(p)

    linked = set()
    for p in pages:
        for link in extract_wikilinks(read_file(p)):
            for target in pages_by_stem.get(link.lower(), ()):
                if target != p:
                    linked.add(target)

    overview = WIKI_DIR / "overview.md"
    return [p for p in pages if p not in linked and p != overview]
def find_broken_links(pages: list[Path]) -> list[tuple[Path, str]]:
    """Return ``(page, link)`` for every wikilink in *pages* that matches no wiki page.

    A link matches when its text equals a page stem ignoring case. The set
    of known stems is collected once up front; resolving each link with its
    own directory walk made the check quadratic in the size of the wiki.
    A link repeated on a page is reported once per occurrence.
    """
    known_stems = {p.stem.lower() for p in all_wiki_pages()}
    broken = []
    for p in pages:
        for link in extract_wikilinks(read_file(p)):
            if link.lower() not in known_stems:
                broken.append((p, link))
    return broken
def find_missing_entities(pages: list[Path]) -> list[str]:
    """Find entity-like names mentioned in 3+ pages but lacking their own page.

    Each page contributes at most one mention per link text, so one page
    repeating the same link three times no longer qualifies on its own.
    Names are returned in order of first appearance.
    """
    mention_counts: dict[str, int] = defaultdict(int)
    existing_pages = {p.stem.lower() for p in pages}
    for p in pages:
        links = extract_wikilinks(read_file(p))
        # dict.fromkeys de-duplicates within the page while keeping a stable order.
        for link in dict.fromkeys(links):
            if link.lower() not in existing_pages:
                mention_counts[link] += 1
    return [name for name, count in mention_counts.items() if count >= 3]
def run_lint():
    """Run structural and LLM-based checks and return the markdown report.

    The report is also printed. Returns "" (after a message) when the wiki
    has no pages.
    """
    pages = all_wiki_pages()
    today = date.today().isoformat()
    if not pages:
        print("Wiki is empty. Nothing to lint.")
        return ""
    print(f"Linting {len(pages)} wiki pages...")

    # Deterministic checks
    orphans = find_orphans(pages)
    broken = find_broken_links(pages)
    missing_entities = find_missing_entities(pages)
    print(f" orphans: {len(orphans)}")
    print(f" broken links: {len(broken)}")
    print(f" missing entity pages: {len(missing_entities)}")

    # Context for the semantic checks: the first 20 pages, each cut to
    # 1500 characters, to stay within context limits.
    sample = pages[:20]
    context_chunks = []
    for page in sample:
        shown_path = page.relative_to(REPO_ROOT)
        excerpt = read_file(page)[:1500]
        context_chunks.append(f"\n\n### {shown_path}\n{excerpt}")
    pages_context = "".join(context_chunks)

    print(" running semantic lint via API...")
    prompt = f"""You are linting an LLM Wiki. Review the pages below and identify:
1. Contradictions between pages (claims that conflict)
2. Stale content (summaries that newer sources have superseded)
3. Data gaps (important questions the wiki can't answer — suggest specific sources to find)
4. Concepts mentioned but lacking depth
Wiki pages (sample of {len(sample)} pages):
{pages_context}
Return a markdown lint report with these sections:
## Contradictions
## Stale Content
## Data Gaps & Suggested Sources
## Concepts Needing More Depth
Be specific — name the exact pages and claims involved.
"""
    semantic_report = call_llm(prompt, "LLM_MODEL", "claude-3-5-sonnet-latest", max_tokens=3000)

    # Compose full report: header, structural findings, then the LLM's section.
    report_lines = [
        f"# Wiki Lint Report — {today}",
        "",
        f"Scanned {len(pages)} pages.",
        "",
        "## Structural Issues",
        "",
    ]
    if orphans:
        report_lines.append("### Orphan Pages (no inbound links)")
        report_lines.extend(f"- `{page.relative_to(REPO_ROOT)}`" for page in orphans)
        report_lines.append("")
    if broken:
        report_lines.append("### Broken Wikilinks")
        report_lines.extend(
            f"- `{page.relative_to(REPO_ROOT)}` links to `[[{link}]]` — not found"
            for page, link in broken
        )
        report_lines.append("")
    if missing_entities:
        report_lines.append("### Missing Entity Pages (mentioned 3+ times but no page)")
        report_lines.extend(f"- `[[{name}]]`" for name in missing_entities)
        report_lines.append("")
    if not (orphans or broken or missing_entities):
        report_lines.extend(["No structural issues found.", ""])
    report_lines.extend(["---", "", semantic_report])

    report = "\n".join(report_lines)
    print("\n" + report)
    return report
def append_log(entry: str):
    """Prepend *entry* to the wiki log so the newest record comes first."""
    previous = read_file(LOG_FILE)
    combined = entry.strip() + "\n\n" + previous
    LOG_FILE.write_text(combined, encoding="utf-8")
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Lint the LLM Wiki")
    cli.add_argument("--save", action="store_true", help="Save lint report to wiki/lint-report.md")
    options = cli.parse_args()

    report = run_lint()
    # An empty report means the wiki had no pages; there is nothing to save then.
    if options.save and report:
        destination = WIKI_DIR / "lint-report.md"
        destination.write_text(report, encoding="utf-8")
        print(f"\nSaved: {destination.relative_to(REPO_ROOT)}")
        stamp = date.today().isoformat()
        append_log(f"## [{stamp}] lint | Wiki health check\n\nRan lint. See lint-report.md for details.")

192
tools/query.py Normal file
View File

@@ -0,0 +1,192 @@
#!/usr/bin/env python3
"""
Query the LLM Wiki.
Usage:
python tools/query.py "What are the main themes across all sources?"
python tools/query.py "How does ConceptA relate to ConceptB?" --save
python tools/query.py "Summarize everything about EntityName" --save syntheses/my-analysis.md
Flags:
--save Save the answer back into the wiki (prompts for filename)
--save <path> Save to a specific wiki path
"""
import sys
import re
import json
import argparse
from pathlib import Path
from datetime import date
import os
# Paths are derived from the directory two levels above this file.
REPO_ROOT = Path(__file__).parent.parent
WIKI_DIR = REPO_ROOT / "wiki"
INDEX_FILE = WIKI_DIR / "index.md"
LOG_FILE = WIKI_DIR / "log.md"
# Contents of this file are passed to the LLM as "Schema" when answering.
SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"
def read_file(path: Path) -> str:
    """Return the UTF-8 text stored at *path*; a missing path gives ""."""
    text = ""
    if path.exists():
        text = path.read_text(encoding="utf-8")
    return text
def write_file(path: Path, content: str):
    """Write *content* to *path* as UTF-8, creating parent directories, and report it.

    The printed path is relative to REPO_ROOT, so *path* must be inside the
    repository (``relative_to`` raises ValueError otherwise).
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
    shown = path.relative_to(REPO_ROOT)
    print(f" saved: {shown}")
def call_llm(prompt: str, model_env: str, default_model: str, max_tokens: int = 4096) -> str:
    """Send *prompt* as a single user message via litellm and return the reply text.

    The model name is read from the environment variable named by
    *model_env*, falling back to *default_model*. If litellm cannot be
    imported, an error is printed and the process exits with status 1.
    """
    try:
        from litellm import completion
    except ImportError:
        print("Error: litellm not installed. Run: pip install litellm")
        sys.exit(1)

    chosen_model = os.getenv(model_env, default_model)
    conversation = [{"role": "user", "content": prompt}]
    reply = completion(
        model=chosen_model,
        messages=conversation,
        max_tokens=max_tokens
    )
    return reply.choices[0].message.content
def find_relevant_pages(question: str, index_content: str) -> list[Path]:
    """Extract linked pages from index that seem relevant to the question.

    Only markdown links ``[title](href)`` in the index are considered. A
    link is relevant when any of these holds (checked in this order):
      1. a whitespace-separated word of the title longer than 3 characters
         occurs in the question;
      2. the whole title (2+ characters) occurs in the question;
      3. a run of 2+ non-ASCII characters from the title occurs in the question.
    wiki/overview.md is put first when it exists; at most 12 pages are returned.
    """
    needle = question.lower()
    non_ascii_run = re.compile(r'[^\x00-\x7F]{2,}')

    def is_match(title: str) -> bool:
        lowered = title.lower()
        # 1. English/space-separated titles: words longer than 3 characters
        long_words = [word for word in lowered.split() if len(word) > 3]
        if any(word in needle for word in long_words):
            return True
        # 2. Whole-title substring (useful for short CJK titles, e.g. len=2)
        if len(lowered) >= 2 and lowered in needle:
            return True
        # 3. CJK chunks: contiguous non-ASCII runs of length >= 2
        return any(chunk in needle for chunk in non_ascii_run.findall(lowered))

    chosen = []
    for title, href in re.findall(r'\[([^\]]+)\]\(([^)]+)\)', index_content):
        if not is_match(title):
            continue
        candidate = WIKI_DIR / href
        if candidate.exists() and candidate not in chosen:
            chosen.append(candidate)

    # Always include overview
    overview_page = WIKI_DIR / "overview.md"
    if overview_page.exists() and overview_page not in chosen:
        chosen.insert(0, overview_page)
    return chosen[:12]  # cap to avoid context overflow
def append_log(entry: str):
    """Prepend *entry* to the wiki log so the newest record comes first."""
    previous = read_file(LOG_FILE)
    combined = entry.strip() + "\n\n" + previous
    LOG_FILE.write_text(combined, encoding="utf-8")
def query(question: str, save_path: str | None = None):
    """Answer *question* from the wiki and optionally save the answer as a page.

    Args:
        question: the natural-language question.
        save_path: ``None`` to not save; ``""`` to prompt for a slug (saved
            under ``syntheses/``); otherwise a path relative to the wiki.

    Exits with status 1 when the wiki index is missing or empty.
    """
    today = date.today().isoformat()
    # Step 1: Read index
    index_content = read_file(INDEX_FILE)
    if not index_content:
        print("Wiki is empty. Ingest some sources first with: python tools/ingest.py <source>")
        sys.exit(1)

    # Step 2: Find relevant pages
    relevant_pages = find_relevant_pages(question, index_content)
    # With at most one keyword hit (typically just the overview), ask the
    # model to pick pages from the index instead.
    if len(relevant_pages) <= 1:
        print(" selecting relevant pages via API...")
        prompt = f"Given this wiki index:\n\n{index_content}\n\nWhich pages are most relevant to answering: \"{question}\"\n\nReturn ONLY a JSON array of relative file paths (as listed in the index), e.g. [\"sources/foo.md\", \"concepts/Bar.md\"]. Maximum 10 pages."
        raw = call_llm(prompt, "LLM_MODEL_FAST", "claude-3-5-haiku-latest", max_tokens=512)
        raw = (raw or "").strip()
        raw = re.sub(r"^```(?:json)?\s*", "", raw)
        raw = re.sub(r"\s*```$", "", raw)
        try:
            paths = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            paths = None
        if isinstance(paths, list):
            wiki_root = WIKI_DIR.resolve()
            selected = []
            for rel in paths:
                if not isinstance(rel, str):
                    continue
                candidate = WIKI_DIR / rel
                # Model output is untrusted: accept only existing files inside the wiki.
                if (candidate.is_file()
                        and candidate.resolve().is_relative_to(wiki_root)
                        and candidate not in selected):
                    selected.append(candidate)
            if selected:
                relevant_pages = selected

    # Step 3: Read relevant pages
    pages_context = "".join(
        f"\n\n### {p.relative_to(REPO_ROOT)}\n{p.read_text(encoding='utf-8')}"
        for p in relevant_pages
    )
    if not pages_context:
        pages_context = f"\n\n### wiki/index.md\n{index_content}"
    schema = read_file(SCHEMA_FILE)

    # Step 4: Synthesize answer
    print(f" synthesizing answer from {len(relevant_pages)} pages...")
    prompt = f"""You are querying an LLM Wiki to answer a question. Use the wiki pages below to synthesize a thorough answer. Cite sources using [[PageName]] wikilink syntax.
Schema:
{schema}
Wiki pages:
{pages_context}
Question: {question}
Write a well-structured markdown answer with headers, bullets, and [[wikilink]] citations. At the end, add a ## Sources section listing the pages you drew from.
"""
    answer = call_llm(prompt, "LLM_MODEL", "claude-3-5-sonnet-latest", max_tokens=4096)
    print("\n" + "=" * 60)
    print(answer)
    print("=" * 60)

    # Step 5: Optionally save answer
    if save_path is not None:
        if save_path == "":
            # Prompt for filename
            slug = input("\nSave as (slug, e.g. 'my-analysis'): ").strip()
            if not slug:
                print("Skipping save.")
                return
            save_path = f"syntheses/{slug}.md"
        full_save_path = WIKI_DIR / save_path
        # A JSON string literal is a valid YAML double-quoted scalar, so
        # quotes, backslashes and newlines in the question are escaped.
        yaml_title = json.dumps(question[:80], ensure_ascii=False)
        frontmatter = f"""---
title: {yaml_title}
type: synthesis
tags: []
sources: []
last_updated: {today}
---
"""
        write_file(full_save_path, frontmatter + answer)

        # Update index. Square brackets and line breaks in the label would
        # break the markdown link, so they are neutralized.
        link_label = (question[:60].replace("[", "(").replace("]", ")")
                      .replace("\r", " ").replace("\n", " "))
        entry = f"- [{link_label}]({save_path}) — synthesis"
        index_lines = read_file(INDEX_FILE).splitlines()
        for position, line in enumerate(index_lines):
            if line.strip() == "## Syntheses":
                index_lines.insert(position + 1, entry)
                break
        else:
            # No Syntheses section yet: create it so the entry is not lost.
            if index_lines and index_lines[-1].strip():
                index_lines.append("")
            index_lines.extend(["## Syntheses", entry])
        INDEX_FILE.write_text("\n".join(index_lines) + "\n", encoding="utf-8")
        print(f" indexed: {save_path}")

    # Append to log
    append_log(f"## [{today}] query | {question[:80]}\n\nSynthesized answer from {len(relevant_pages)} pages." +
               (f" Saved to {save_path}." if save_path else ""))
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Query the LLM Wiki")
    cli.add_argument("question", help="Question to ask the wiki")
    # --save alone yields "" (prompt for a slug); absent yields None (do not save).
    cli.add_argument(
        "--save",
        nargs="?",
        const="",
        default=None,
        help="Save answer to wiki (optionally specify path)",
    )
    options = cli.parse_args()
    query(options.question, options.save)