#!/usr/bin/env python3
"""
Build the knowledge graph from the wiki.

Usage:
    python tools/build_graph.py               # full rebuild
    python tools/build_graph.py --no-infer    # skip semantic inference (faster)
    python tools/build_graph.py --open        # open graph.html in browser after build

Outputs:
    graph/graph.json    — node/edge data (cached by SHA256)
    graph/graph.html    — interactive vis.js visualization

Edge types:
    EXTRACTED   — explicit [[wikilink]] in a page
    INFERRED    — Claude-detected implicit relationship
    AMBIGUOUS   — low-confidence inferred relationship
"""

import re
import json
import hashlib
import argparse
import webbrowser
from pathlib import Path
from datetime import date
import os

try:
    import networkx as nx
    from networkx.algorithms import community as nx_community
    HAS_NETWORKX = True
except ImportError:
    HAS_NETWORKX = False
    print("Warning: networkx not installed. Community detection disabled. Run: pip install networkx")

REPO_ROOT = Path(__file__).parent.parent
WIKI_DIR = REPO_ROOT / "wiki"
GRAPH_DIR = REPO_ROOT / "graph"
GRAPH_JSON = GRAPH_DIR / "graph.json"
GRAPH_HTML = GRAPH_DIR / "graph.html"
CACHE_FILE = GRAPH_DIR / ".cache.json"
LOG_FILE = WIKI_DIR / "log.md"
SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"

# Node type → color mapping
TYPE_COLORS = {
    "source": "#4CAF50",
    "entity": "#2196F3",
    "concept": "#FF9800",
    "synthesis": "#9C27B0",
    "unknown": "#9E9E9E",
}

EDGE_COLORS = {
    "EXTRACTED": "#555555",
    "INFERRED": "#FF5722",
    "AMBIGUOUS": "#BDBDBD",
}


def read_file(path: Path) -> str:
    """Return the UTF-8 text of *path*, or "" when the file does not exist."""
    try:
        return path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return ""


def call_llm(prompt: str, model_env: str, default_model: str, max_tokens: int = 4096) -> str:
    """Send *prompt* as a single user message through litellm and return the reply text.

    The model name is read from the environment variable *model_env*, falling
    back to *default_model*. Exits the process with status 1 when litellm is
    not installed. An empty/None reply is returned as "".
    """
    try:
        from litellm import completion
    except ImportError:
        print("Error: litellm not installed. Run: pip install litellm")
        import sys
        sys.exit(1)

    model = os.getenv(model_env, default_model)
    response = completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )
    # The message content may be None; callers expect a string.
    return response.choices[0].message.content or ""


def sha256(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded as UTF-8."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def all_wiki_pages() -> list[Path]:
    """Return every Markdown page under WIKI_DIR except the bookkeeping files.

    The result is sorted so that node/edge output does not depend on the
    filesystem's directory enumeration order.
    """
    skipped = ("index.md", "log.md", "lint-report.md")
    return sorted(p for p in WIKI_DIR.rglob("*.md") if p.name not in skipped)


def extract_wikilinks(content: str) -> list[str]:
    """Return the distinct ``[[...]]`` link texts in *content*, in first-seen order."""
    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a plain set would yield a different order from run to run).
    return list(dict.fromkeys(re.findall(r'\[\[([^\]]+)\]\]', content)))


def extract_frontmatter_type(content: str) -> str:
    """Return the value of the first ``type:`` line in *content*, or "unknown".

    Surrounding quotes are stripped. Only whitespace on the same line is
    skipped after the colon, so an empty ``type:`` does not capture the
    following line.
    """
    match = re.search(r'^type:[ \t]*(\S+)', content, re.MULTILINE)
    return match.group(1).strip('"\'') if match else "unknown"


def page_id(path: Path) -> str:
    """Return the page id of *path*: its POSIX path relative to WIKI_DIR without ".md".

    Raises ValueError (from ``relative_to``) when *path* is not under WIKI_DIR.
    """
    rel = path.relative_to(WIKI_DIR)
    # Strip only the trailing extension; a ".md" substring elsewhere in the
    # path must be left alone.
    if rel.suffix == ".md":
        rel = rel.with_suffix("")
    return rel.as_posix()


def load_cache() -> dict:
    """Load the inference cache from CACHE_FILE.

    Returns {} when the file is missing, unreadable, not valid JSON, or does
    not contain a JSON object.
    """
    try:
        data = json.loads(CACHE_FILE.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return {}
    return data if isinstance(data, dict) else {}


def save_cache(cache: dict):
    """Write *cache* to CACHE_FILE as indented JSON, creating GRAPH_DIR if needed."""
    GRAPH_DIR.mkdir(parents=True, exist_ok=True)
    CACHE_FILE.write_text(json.dumps(cache, indent=2), encoding="utf-8")


def build_nodes(pages: list[Path]) -> list[dict]:
    """Build one node dict per page.

    Each node carries ``id``, ``label`` (the ``title:`` line, else the file
    stem), ``type``, ``color`` (from TYPE_COLORS) and ``path`` (relative to
    REPO_ROOT).
    """
    nodes = []
    for p in pages:
        content = read_file(p)
        node_type = extract_frontmatter_type(content)
        # [ \t]* keeps the match on the title line itself.
        title_match = re.search(r'^title:[ \t]*"?([^"\n]+)"?', content, re.MULTILINE)
        label = title_match.group(1).strip() if title_match else p.stem
        nodes.append({
            "id": page_id(p),
            "label": label,
            "type": node_type,
            "color": TYPE_COLORS.get(node_type, TYPE_COLORS["unknown"]),
            "path": str(p.relative_to(REPO_ROOT)),
        })
    return nodes


def _resolve_wikilink(link: str, stem_map: dict):
    """Map raw wikilink text to a page id via *stem_map*, or None if nothing matches."""
    exact = stem_map.get(link.lower())
    if exact:
        return exact
    # Retry without the "|alias" display text and the "#heading" anchor.
    bare = link.split("|", 1)[0].split("#", 1)[0].strip()
    return stem_map.get(bare.lower())


def build_extracted_edges(pages: list[Path]) -> list[dict]:
    """Pass 1: deterministic wikilink edges.

    Links are resolved case-insensitively against page file stems. Self-links
    and unresolved links are dropped, and each (source, target) pair is
    emitted once with type EXTRACTED and confidence 1.0.
    """
    # Map from stem (lower) -> page_id for resolution. Pages sharing a stem
    # collide here; the last one in *pages* wins.
    stem_map = {p.stem.lower(): page_id(p) for p in pages}
    edges = []
    seen = set()
    for p in pages:
        src = page_id(p)
        for link in extract_wikilinks(read_file(p)):
            target = _resolve_wikilink(link, stem_map)
            if not target or target == src:
                continue
            key = (src, target)
            if key in seen:
                continue
            seen.add(key)
            edges.append({
                "from": src,
                "to": target,
                "type": "EXTRACTED",
                "color": EDGE_COLORS["EXTRACTED"],
                "confidence": 1.0,
            })
    return edges


def _clean_relationships(raw_rels, src: str, valid_ids: set) -> list[dict]:
    """Return normalized relationship records usable as edges from *src*.

    Drops entries that are not dicts, that point at *src* itself, or whose
    target is not a known page id. A missing or unparsable confidence becomes
    0.7, and the result is clamped to [0, 1]. A type other than
    INFERRED/AMBIGUOUS is derived from the confidence (>= 0.7 → INFERRED).
    """
    if not isinstance(raw_rels, list):
        return []
    cleaned = []
    for rel in raw_rels:
        if not isinstance(rel, dict):
            continue
        target = rel.get("to")
        if not isinstance(target, str) or target == src or target not in valid_ids:
            continue
        try:
            confidence = float(rel.get("confidence", 0.7))
        except (TypeError, ValueError):
            confidence = 0.7
        confidence = max(0.0, min(1.0, confidence))
        rel_type = rel.get("type")
        if rel_type not in ("INFERRED", "AMBIGUOUS"):
            rel_type = "INFERRED" if confidence >= 0.7 else "AMBIGUOUS"
        cleaned.append({
            "to": target,
            "relationship": str(rel.get("relationship") or ""),
            "confidence": confidence,
            "type": rel_type,
        })
    return cleaned


def _inferred_edge(src: str, rel: dict) -> dict:
    """Turn one normalized relationship record into an edge dict from *src*."""
    return {
        "from": src,
        "to": rel["to"],
        "type": rel["type"],
        "title": rel["relationship"],
        "label": "",
        "color": EDGE_COLORS.get(rel["type"], EDGE_COLORS["INFERRED"]),
        "confidence": rel["confidence"],
    }


def build_inferred_edges(pages: list[Path], existing_edges: list[dict], cache: dict) -> list[dict]:
    """Pass 2: API-inferred semantic relationships.

    For every page whose full-content SHA-256 matches its *cache* entry the
    cached relationships are reused; every other page is sent to the LLM.
    *cache* is updated in place (keyed by ``str(path)``) for each page that is
    successfully re-inferred. Returns the cached and the freshly inferred
    edges together.
    """
    valid_ids = {page_id(p) for p in pages}
    new_edges = []
    changed_pages = []  # (path, full content) pairs that need inference
    type_by_id = {}

    for p in pages:
        content = read_file(p)
        src = page_id(p)
        type_by_id[src] = extract_frontmatter_type(content)
        entry = cache.get(str(p))
        if not isinstance(entry, dict) or entry.get("hash") != sha256(content):
            changed_pages.append((p, content))
            continue
        # Page unchanged: reuse its cached relationships.
        cached_rels = _clean_relationships(entry.get("edges", []), src, valid_ids)
        new_edges.extend(_inferred_edge(src, rel) for rel in cached_rels)

    if not changed_pages:
        print(" no changed pages — skipping semantic inference")
        # The cached edges must still be returned, otherwise an unchanged wiki
        # would lose all of its inferred edges.
        return new_edges

    print(f" inferring relationships for {len(changed_pages)} changed pages...")

    # Summary of all nodes, given to the model as the set of allowed targets.
    node_list = "\n".join(f"- {page_id(p)} ({type_by_id[page_id(p)]})" for p in pages)

    # Group the extracted edges by source so each prompt lists that page's own edges.
    edges_by_source = {}
    for e in existing_edges:
        edges_by_source.setdefault(e["from"], []).append(e)

    for p, full_content in changed_pages:
        content = full_content[:2000]  # truncate for context efficiency
        src = page_id(p)
        existing_edge_summary = "\n".join(
            f"- {e['from']} → {e['to']} (EXTRACTED)"
            for e in edges_by_source.get(src, [])[:30]
        )
        prompt = f"""Analyze this wiki page and identify implicit semantic relationships to other pages in the wiki.

Source page: {src}
Content:
{content}

All available pages:
{node_list}

Already-extracted edges from this page:
{existing_edge_summary}

Return ONLY a JSON array of NEW relationships not already captured by explicit wikilinks:
[
  {{"to": "page-id", "relationship": "one-line description", "confidence": 0.0-1.0, "type": "INFERRED or AMBIGUOUS"}}
]

Rules:
- Only include pages from the available list above
- Confidence >= 0.7 → INFERRED, < 0.7 → AMBIGUOUS
- Do not repeat edges already in the extracted list
- Return empty array [] if no new relationships found
"""
        raw = call_llm(prompt, "LLM_MODEL_FAST", "claude-3-5-haiku-latest", max_tokens=1024)
        raw = (raw or "").strip()
        # Strip a Markdown code fence if the model wrapped its answer in one.
        raw = re.sub(r"^```(?:json)?\s*", "", raw)
        raw = re.sub(r"\s*```$", "", raw)
        try:
            inferred = json.loads(raw)
        except ValueError:
            inferred = None
        if not isinstance(inferred, list):
            # Leave the cache entry untouched so the page is retried next run.
            print(f" warning: could not parse inferred relationships for {src}; skipping")
            continue

        valid_rels = _clean_relationships(inferred, src, valid_ids)
        new_edges.extend(_inferred_edge(src, rel) for rel in valid_rels)
        # Hash the FULL content: that is what the change check above compares
        # against, so hashing the truncated text would never match for pages
        # longer than the truncation limit.
        cache[str(p)] = {
            "hash": sha256(full_content),
            "edges": valid_rels,
        }

    return new_edges


def detect_communities(nodes: list[dict], edges: list[dict]) -> dict[str, int]:
    """Assign community IDs to nodes using Louvain algorithm.

    Returns a mapping of node id → community index, or {} when networkx is
    unavailable, the graph has no usable edges, or detection fails. Edges
    whose endpoints are not in *nodes* are ignored.
    """
    if not HAS_NETWORKX:
        return {}

    G = nx.Graph()
    G.add_nodes_from(n["id"] for n in nodes)
    known = set(G.nodes)
    # add_edge would silently create nodes for unknown ids, so filter first.
    G.add_edges_from(
        (e["from"], e["to"]) for e in edges
        if e["from"] in known and e["to"] in known
    )

    if G.number_of_edges() == 0:
        return {}

    try:
        communities = nx_community.louvain_communities(G, seed=42)
    except Exception as exc:
        # Best-effort feature: report the failure instead of hiding it.
        print(f"Warning: community detection failed: {exc}")
        return {}
    return {node: i for i, comm in enumerate(communities) for node in comm}


COMMUNITY_COLORS = [
    "#E91E63", "#00BCD4", "#8BC34A", "#FF5722", "#673AB7",
    "#FFC107", "#009688", "#F44336", "#3F51B5", "#CDDC39",
]


# NOTE(review): the "def" keyword of render_html sat at the end of this
# collapsed line; that definition continues below and is outside this block.
render_html(nodes: list[dict], edges: list[dict]) -> str: """Generate self-contained vis.js HTML.""" nodes_json = json.dumps(nodes, indent=2) edges_json = json.dumps(edges, indent=2) legend_items = "".join( f'{t}' for t, color in TYPE_COLORS.items() if t != "unknown" ) return f"""