#!/usr/bin/env python3 """ Build the knowledge graph from the wiki. Usage: python tools/build_graph.py # full rebuild python tools/build_graph.py --no-infer # skip semantic inference (faster) python tools/build_graph.py --open # open graph.html in browser after build Outputs: graph/graph.json — node/edge data (cached by SHA256) graph/graph.html — interactive vis.js visualization Edge types: EXTRACTED — explicit [[wikilink]] in a page INFERRED — Claude-detected implicit relationship AMBIGUOUS — low-confidence inferred relationship """ import re import json import hashlib import argparse import webbrowser from pathlib import Path from datetime import date import anthropic try: import networkx as nx from networkx.algorithms import community as nx_community HAS_NETWORKX = True except ImportError: HAS_NETWORKX = False print("Warning: networkx not installed. Community detection disabled. Run: pip install networkx") REPO_ROOT = Path(__file__).parent.parent WIKI_DIR = REPO_ROOT / "wiki" GRAPH_DIR = REPO_ROOT / "graph" GRAPH_JSON = GRAPH_DIR / "graph.json" GRAPH_HTML = GRAPH_DIR / "graph.html" CACHE_FILE = GRAPH_DIR / ".cache.json" LOG_FILE = WIKI_DIR / "log.md" SCHEMA_FILE = REPO_ROOT / "CLAUDE.md" # Node type → color mapping TYPE_COLORS = { "source": "#4CAF50", "entity": "#2196F3", "concept": "#FF9800", "synthesis": "#9C27B0", "unknown": "#9E9E9E", } EDGE_COLORS = { "EXTRACTED": "#555555", "INFERRED": "#FF5722", "AMBIGUOUS": "#BDBDBD", } def read_file(path: Path) -> str: return path.read_text(encoding="utf-8") if path.exists() else "" def sha256(text: str) -> str: return hashlib.sha256(text.encode()).hexdigest() def all_wiki_pages() -> list[Path]: return [p for p in WIKI_DIR.rglob("*.md") if p.name not in ("index.md", "log.md", "lint-report.md")] def extract_wikilinks(content: str) -> list[str]: return list(set(re.findall(r'\[\[([^\]]+)\]\]', content))) def extract_frontmatter_type(content: str) -> str: match = re.search(r'^type:\s*(\S+)', content, re.MULTILINE) return match.group(1).strip('"\'') if match else "unknown" def page_id(path: Path) -> str: return path.relative_to(WIKI_DIR).as_posix().replace(".md", "") def load_cache() -> dict: if CACHE_FILE.exists(): try: return json.loads(CACHE_FILE.read_text()) except (json.JSONDecodeError, IOError): return {} return {} def save_cache(cache: dict): GRAPH_DIR.mkdir(parents=True, exist_ok=True) CACHE_FILE.write_text(json.dumps(cache, indent=2)) def build_nodes(pages: list[Path]) -> list[dict]: nodes = [] for p in pages: content = read_file(p) node_type = extract_frontmatter_type(content) title_match = re.search(r'^title:\s*"?([^"\n]+)"?', content, re.MULTILINE) label = title_match.group(1).strip() if title_match else p.stem nodes.append({ "id": page_id(p), "label": label, "type": node_type, "color": TYPE_COLORS.get(node_type, TYPE_COLORS["unknown"]), "path": str(p.relative_to(REPO_ROOT)), }) return nodes def build_extracted_edges(pages: list[Path]) -> list[dict]: """Pass 1: deterministic wikilink edges.""" # Build a map from stem (lower) -> page_id for resolution stem_map = {p.stem.lower(): page_id(p) for p in pages} edges = [] seen = set() for p in pages: content = read_file(p) src = page_id(p) for link in extract_wikilinks(content): target = stem_map.get(link.lower()) if target and target != src: key = (src, target) if key not in seen: seen.add(key) edges.append({ "from": src, "to": target, "type": "EXTRACTED", "color": EDGE_COLORS["EXTRACTED"], "confidence": 1.0, }) return edges def build_inferred_edges(pages: list[Path], existing_edges: list[dict], cache: dict) -> list[dict]: """Pass 2: Claude-inferred semantic relationships.""" client = anthropic.Anthropic() new_edges = [] # Only process pages that changed since last run changed_pages = [] for p in pages: content = read_file(p) h = sha256(content) entry = cache.get(str(p)) if not isinstance(entry, dict) or entry.get("hash") != h: changed_pages.append(p) else: # Page unchanged: load its inferred edges from cache perfectly src = page_id(p) for rel in entry.get("edges", []): new_edges.append({ "from": src, "to": rel["to"], "type": rel.get("type", "INFERRED"), "title": rel.get("relationship", ""), "label": "", "color": EDGE_COLORS.get(rel.get("type", "INFERRED"), EDGE_COLORS["INFERRED"]), "confidence": float(rel.get("confidence", 0.7)), }) if not changed_pages: print(" no changed pages — skipping semantic inference") return [] print(f" inferring relationships for {len(changed_pages)} changed pages...") # Build a summary of existing nodes for context node_list = "\n".join(f"- {page_id(p)} ({extract_frontmatter_type(read_file(p))})" for p in pages) existing_edge_summary = "\n".join( f"- {e['from']} → {e['to']} (EXTRACTED)" for e in existing_edges[:30] ) for p in changed_pages: content = read_file(p)[:2000] # truncate for context efficiency src = page_id(p) response = client.messages.create( model="claude-haiku-4-5-20251001", max_tokens=1024, messages=[{ "role": "user", "content": f"""Analyze this wiki page and identify implicit semantic relationships to other pages in the wiki. Source page: {src} Content: {content} All available pages: {node_list} Already-extracted edges from this page: {existing_edge_summary} Return ONLY a JSON array of NEW relationships not already captured by explicit wikilinks: [ {{"to": "page-id", "relationship": "one-line description", "confidence": 0.0-1.0, "type": "INFERRED or AMBIGUOUS"}} ] Rules: - Only include pages from the available list above - Confidence >= 0.7 → INFERRED, < 0.7 → AMBIGUOUS - Do not repeat edges already in the extracted list - Return empty array [] if no new relationships found """ }] ) raw = response.content[0].text.strip() raw = re.sub(r"^```(?:json)?\s*", "", raw) raw = re.sub(r"\s*```$", "", raw) try: inferred = json.loads(raw) valid_rels = [] for rel in inferred: if isinstance(rel, dict) and "to" in rel: new_edges.append({ "from": src, "to": rel["to"], "type": rel.get("type", "INFERRED"), "title": rel.get("relationship", ""), "label": "", "color": EDGE_COLORS.get(rel.get("type", "INFERRED"), EDGE_COLORS["INFERRED"]), "confidence": float(rel.get("confidence", 0.7)), }) valid_rels.append(rel) # Save properly to cache cache[str(p)] = { "hash": sha256(content), "edges": valid_rels } except (json.JSONDecodeError, TypeError, ValueError): pass return new_edges def detect_communities(nodes: list[dict], edges: list[dict]) -> dict[str, int]: """Assign community IDs to nodes using Louvain algorithm.""" if not HAS_NETWORKX: return {} G = nx.Graph() for n in nodes: G.add_node(n["id"]) for e in edges: G.add_edge(e["from"], e["to"]) if G.number_of_edges() == 0: return {} try: communities = nx_community.louvain_communities(G, seed=42) node_to_community = {} for i, comm in enumerate(communities): for node in comm: node_to_community[node] = i return node_to_community except Exception: return {} COMMUNITY_COLORS = [ "#E91E63", "#00BCD4", "#8BC34A", "#FF5722", "#673AB7", "#FFC107", "#009688", "#F44336", "#3F51B5", "#CDDC39", ] def render_html(nodes: list[dict], edges: list[dict]) -> str: """Generate self-contained vis.js HTML.""" nodes_json = json.dumps(nodes, indent=2) edges_json = json.dumps(edges, indent=2) legend_items = "".join( f'{t}' for t, color in TYPE_COLORS.items() if t != "unknown" ) return f""" LLM Wiki — Knowledge Graph

LLM Wiki Graph

{legend_items}
── Explicit link
── Inferred


""" def append_log(entry: str): log_path = WIKI_DIR / "log.md" existing = read_file(log_path) log_path.write_text(entry.strip() + "\n\n" + existing, encoding="utf-8") def build_graph(infer: bool = True, open_browser: bool = False): pages = all_wiki_pages() today = date.today().isoformat() if not pages: print("Wiki is empty. Ingest some sources first.") return print(f"Building graph from {len(pages)} wiki pages...") GRAPH_DIR.mkdir(parents=True, exist_ok=True) cache = load_cache() # Pass 1: extracted edges print(" Pass 1: extracting wikilinks...") nodes = build_nodes(pages) edges = build_extracted_edges(pages) print(f" → {len(edges)} extracted edges") # Pass 2: inferred edges if infer: print(" Pass 2: inferring semantic relationships...") inferred = build_inferred_edges(pages, edges, cache) edges.extend(inferred) print(f" → {len(inferred)} inferred edges") save_cache(cache) # Community detection print(" Running Louvain community detection...") communities = detect_communities(nodes, edges) for node in nodes: comm_id = communities.get(node["id"], -1) if comm_id >= 0: node["color"] = COMMUNITY_COLORS[comm_id % len(COMMUNITY_COLORS)] node["group"] = comm_id # Save graph.json graph_data = {"nodes": nodes, "edges": edges, "built": today} GRAPH_JSON.write_text(json.dumps(graph_data, indent=2)) print(f" saved: graph/graph.json ({len(nodes)} nodes, {len(edges)} edges)") # Save graph.html html = render_html(nodes, edges) GRAPH_HTML.write_text(html) print(f" saved: graph/graph.html") append_log(f"## [{today}] graph | Knowledge graph rebuilt\n\n{len(nodes)} nodes, {len(edges)} edges ({len([e for e in edges if e['type']=='EXTRACTED'])} extracted, {len([e for e in edges if e['type']=='INFERRED'])} inferred).") if open_browser: webbrowser.open(f"file://{GRAPH_HTML.resolve()}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Build LLM Wiki knowledge graph") parser.add_argument("--no-infer", action="store_true", help="Skip semantic inference (faster)") parser.add_argument("--open", action="store_true", help="Open graph.html in browser") args = parser.parse_args() build_graph(infer=not args.no_infer, open_browser=args.open)