Auto-sync

2026-04-14 11:58:16 +08:00
parent abc9369d1f
commit be67293b60
20 changed files with 2246 additions and 9 deletions
--- a/tools/build_graph.py
+++ b/tools/build_graph.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+"""
+Build the knowledge graph from the wiki.
+
+Usage:
+    python tools/build_graph.py               # full rebuild
+    python tools/build_graph.py --no-infer    # skip semantic inference (faster)
+    python tools/build_graph.py --open        # open graph.html in browser after build
+
+Outputs:
+    graph/graph.json    — node/edge data (cached by SHA256)
+    graph/graph.html    — interactive vis.js visualization
+
+Edge types:
+    EXTRACTED   — explicit [[wikilink]] in a page
+    INFERRED    — LLM-detected implicit relationship (model set via LLM_MODEL_FAST)
+    AMBIGUOUS   — low-confidence inferred relationship
+"""
+
+import re
+import json
+import hashlib
+import argparse
+import webbrowser
+from pathlib import Path
+from datetime import date
+
+import os
+
+try:
+    import networkx as nx
+    from networkx.algorithms import community as nx_community
+    HAS_NETWORKX = True
+except ImportError:
+    HAS_NETWORKX = False
+    print("Warning: networkx not installed. Community detection disabled. Run: pip install networkx")
+
+# Filesystem layout, all derived from this script's location (<repo>/tools/).
+REPO_ROOT = Path(__file__).parent.parent
+WIKI_DIR = REPO_ROOT / "wiki"
+GRAPH_DIR = REPO_ROOT / "graph"
+GRAPH_JSON = GRAPH_DIR / "graph.json"
+GRAPH_HTML = GRAPH_DIR / "graph.html"
+# Per-page content hash plus the relationships inferred for that page.
+CACHE_FILE = GRAPH_DIR / ".cache.json"
+LOG_FILE = WIKI_DIR / "log.md"
+SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"
+
+# Node type → color mapping ("unknown" is the fallback for unrecognised types)
+TYPE_COLORS = {
+    "source": "#4CAF50",
+    "entity": "#2196F3",
+    "concept": "#FF9800",
+    "synthesis": "#9C27B0",
+    "unknown": "#9E9E9E",
+}
+
+# Edge type → color mapping (keys match the edge types listed in the module docstring)
+EDGE_COLORS = {
+    "EXTRACTED": "#555555",
+    "INFERRED": "#FF5722",
+    "AMBIGUOUS": "#BDBDBD",
+}
+
+
+def read_file(path: Path) -> str:
+    """Return the UTF-8 text of *path*, or "" when the path does not exist."""
+    if not path.exists():
+        return ""
+    return path.read_text(encoding="utf-8")
+
+
+def call_llm(prompt: str, model_env: str, default_model: str, max_tokens: int = 4096) -> str:
+    """Send *prompt* as one user message through litellm and return the reply text.
+
+    The model name is taken from the environment variable named by *model_env*,
+    falling back to *default_model*. Exits with status 1 if litellm is missing.
+    """
+    try:
+        from litellm import completion
+    except ImportError:
+        import sys
+        print("Error: litellm not installed. Run: pip install litellm")
+        sys.exit(1)
+
+    chosen_model = os.getenv(model_env, default_model)
+    user_message = {"role": "user", "content": prompt}
+    reply = completion(
+        model=chosen_model,
+        messages=[user_message],
+        max_tokens=max_tokens
+    )
+    first_choice = reply.choices[0]
+    return first_choice.message.content
+
+
+def sha256(text: str) -> str:
+    """Return the full hexadecimal SHA-256 digest of *text*."""
+    hasher = hashlib.sha256()
+    hasher.update(text.encode())
+    return hasher.hexdigest()
+
+
+def all_wiki_pages() -> list[Path]:
+    """Return every markdown file under WIKI_DIR except index, log and lint report."""
+    skipped_names = ("index.md", "log.md", "lint-report.md")
+    found = []
+    for candidate in WIKI_DIR.rglob("*.md"):
+        if candidate.name in skipped_names:
+            continue
+        found.append(candidate)
+    return found
+
+
+def extract_wikilinks(content: str) -> list[str]:
+    """Return the distinct [[wikilink]] targets in *content* (order is not defined)."""
+    targets = re.findall(r'\[\[([^\]]+)\]\]', content)
+    distinct = set(targets)
+    return list(distinct)
+
+
+def extract_frontmatter_type(content: str) -> str:
+    """Return the value of the first ``type:`` line, with surrounding quotes removed.
+
+    Falls back to "unknown" when no line starts with ``type:``.
+    """
+    found = re.search(r'^type:\s*(\S+)', content, re.MULTILINE)
+    if found is None:
+        return "unknown"
+    value = found.group(1)
+    return value.strip('"\'')
+
+
+def page_id(path: Path) -> str:
+    """Return the node id for a wiki page: its path relative to WIKI_DIR, POSIX-style, without the suffix.
+
+    Only the final suffix is removed, so a ".md" occurring elsewhere in the
+    path (a directory such as "notes.md.d", or a stem like "a.md") is kept.
+    """
+    return path.relative_to(WIKI_DIR).with_suffix("").as_posix()
+
+
+def load_cache() -> dict:
+    """Load the inference cache from CACHE_FILE; return {} if it is absent or unreadable."""
+    if not CACHE_FILE.exists():
+        return {}
+    try:
+        raw = CACHE_FILE.read_text()
+        return json.loads(raw)
+    except (json.JSONDecodeError, IOError):
+        # A corrupt or unreadable cache is treated the same as no cache.
+        return {}
+
+
+def save_cache(cache: dict):
+    """Write *cache* to CACHE_FILE as indented JSON, creating GRAPH_DIR first if needed."""
+    GRAPH_DIR.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(cache, indent=2)
+    CACHE_FILE.write_text(serialized)
+
+
+def build_nodes(pages: list[Path]) -> list[dict]:
+    """Build one node dict per page.
+
+    The label is the page's ``title:`` line when present, otherwise the file
+    stem. The color is looked up from the page's ``type:`` value.
+    """
+    title_pattern = re.compile(r'^title:\s*"?([^"\n]+)"?', re.MULTILINE)
+
+    def make_node(page: Path) -> dict:
+        text = read_file(page)
+        kind = extract_frontmatter_type(text)
+        hit = title_pattern.search(text)
+        if hit:
+            caption = hit.group(1).strip()
+        else:
+            caption = page.stem
+        return {
+            "id": page_id(page),
+            "label": caption,
+            "type": kind,
+            "color": TYPE_COLORS.get(kind, TYPE_COLORS["unknown"]),
+            "path": str(page.relative_to(REPO_ROOT)),
+        }
+
+    return [make_node(page) for page in pages]
+
+
+def build_extracted_edges(pages: list[Path]) -> list[dict]:
+    """Pass 1: deterministic wikilink edges.
+
+    A link resolves when its text equals some page's file stem, ignoring case.
+    Self-links and repeated (source, target) pairs are dropped.
+    """
+    # Lower-cased stem -> page id; later pages win when stems collide.
+    ids_by_stem = {}
+    for page in pages:
+        ids_by_stem[page.stem.lower()] = page_id(page)
+
+    edges = []
+    emitted_pairs = set()
+    for page in pages:
+        text = read_file(page)
+        source_id = page_id(page)
+        for link in extract_wikilinks(text):
+            target_id = ids_by_stem.get(link.lower())
+            if not target_id or target_id == source_id:
+                continue
+            pair = (source_id, target_id)
+            if pair in emitted_pairs:
+                continue
+            emitted_pairs.add(pair)
+            edges.append({
+                "from": source_id,
+                "to": target_id,
+                "type": "EXTRACTED",
+                "color": EDGE_COLORS["EXTRACTED"],
+                "confidence": 1.0,
+            })
+    return edges
+
+
+def build_inferred_edges(pages: list[Path], existing_edges: list[dict], cache: dict) -> list[dict]:
+    """Pass 2: LLM-inferred semantic relationships.
+
+    Pages whose full-content hash matches their entry in *cache* reuse the
+    cached relationships; every other page is sent to the model. *cache* is
+    updated in place (keyed by ``str(path)``) for each page whose response
+    parsed as a JSON list.
+
+    Returns the inferred edges for all pages, cached and fresh alike.
+    """
+
+    def to_edge(src: str, rel) -> dict | None:
+        """Turn one relationship record into an edge dict, or None if it is malformed."""
+        if not isinstance(rel, dict) or not isinstance(rel.get("to"), str):
+            return None
+        try:
+            edge_type = rel.get("type", "INFERRED")
+            return {
+                "from": src,
+                "to": rel["to"],
+                "type": edge_type,
+                "title": rel.get("relationship", ""),
+                "label": "",
+                "color": EDGE_COLORS.get(edge_type, EDGE_COLORS["INFERRED"]),
+                "confidence": float(rel.get("confidence", 0.7)),
+            }
+        except (TypeError, ValueError):
+            return None
+
+    new_edges = []
+
+    # Only process pages that changed since last run
+    changed_pages = []  # (path, full content, hash of full content)
+    for p in pages:
+        content = read_file(p)
+        h = sha256(content)
+        entry = cache.get(str(p))
+
+        if not isinstance(entry, dict) or entry.get("hash") != h:
+            changed_pages.append((p, content, h))
+            continue
+
+        # Page unchanged: rebuild its edges from the cached relationships.
+        src = page_id(p)
+        for rel in entry.get("edges", []):
+            edge = to_edge(src, rel)
+            if edge is not None:
+                new_edges.append(edge)
+
+    if not changed_pages:
+        print("  no changed pages — skipping semantic inference")
+        # Cached edges must still be returned; returning [] here would drop
+        # every inferred edge from the graph whenever the wiki is unchanged.
+        return new_edges
+
+    print(f"  inferring relationships for {len(changed_pages)} changed pages...")
+
+    # Build a summary of existing nodes for context
+    known_ids = {page_id(p) for p in pages}
+    node_list = "\n".join(f"- {page_id(p)} ({extract_frontmatter_type(read_file(p))})" for p in pages)
+
+    for p, full_content, full_hash in changed_pages:
+        content = full_content[:2000]  # truncate for context efficiency
+        src = page_id(p)
+        # The prompt promises edges "from this page", so filter by source.
+        existing_edge_summary = "\n".join(
+            f"- {e['from']} → {e['to']} (EXTRACTED)" for e in existing_edges if e["from"] == src
+        )
+
+        prompt = f"""Analyze this wiki page and identify implicit semantic relationships to other pages in the wiki.
+
+Source page: {src}
+Content:
+{content}
+
+All available pages:
+{node_list}
+
+Already-extracted edges from this page:
+{existing_edge_summary}
+
+Return ONLY a JSON array of NEW relationships not already captured by explicit wikilinks:
+[
+  {{"to": "page-id", "relationship": "one-line description", "confidence": 0.0-1.0, "type": "INFERRED or AMBIGUOUS"}}
+]
+
+Rules:
+- Only include pages from the available list above
+- Confidence >= 0.7 → INFERRED, < 0.7 → AMBIGUOUS
+- Do not repeat edges already in the extracted list
+- Return empty array [] if no new relationships found
+"""
+        raw = call_llm(prompt, "LLM_MODEL_FAST", "claude-3-5-haiku-latest", max_tokens=1024)
+        raw = (raw or "").strip()
+        raw = re.sub(r"^```(?:json)?\s*", "", raw)
+        raw = re.sub(r"\s*```$", "", raw)
+
+        try:
+            inferred = json.loads(raw)
+        except json.JSONDecodeError:
+            inferred = None
+        if not isinstance(inferred, list):
+            # Leave the cache untouched so this page is retried on the next run.
+            print(f"  warning: could not parse inference response for {src}; skipping")
+            continue
+
+        valid_rels = []
+        for rel in inferred:
+            edge = to_edge(src, rel)
+            # Enforce the prompt's rule: targets must be real pages, not the page itself.
+            if edge is None or edge["to"] == src or edge["to"] not in known_ids:
+                continue
+            new_edges.append(edge)
+            valid_rels.append(rel)
+
+        # Store the hash of the FULL content: that is what the change check
+        # above compares against, so truncated hashes would never match.
+        cache[str(p)] = {
+            "hash": full_hash,
+            "edges": valid_rels
+        }
+
+    return new_edges
+
+
+def detect_communities(nodes: list[dict], edges: list[dict]) -> dict[str, int]:
+    """Map node id -> community index using networkx's Louvain implementation.
+
+    Returns {} when networkx is unavailable, when the graph has no edges, or
+    when community detection raises.
+    """
+    if not HAS_NETWORKX:
+        return {}
+
+    graph = nx.Graph()
+    for node in nodes:
+        graph.add_node(node["id"])
+    for edge in edges:
+        graph.add_edge(edge["from"], edge["to"])
+
+    if graph.number_of_edges() == 0:
+        return {}
+
+    try:
+        groups = nx_community.louvain_communities(graph, seed=42)
+        return {
+            member: index
+            for index, group in enumerate(groups)
+            for member in group
+        }
+    except Exception:
+        return {}
+
+
+# Palette for community colouring; build_graph indexes it modulo its length,
+# so communities beyond the tenth reuse colors.
+COMMUNITY_COLORS = [
+    "#E91E63", "#00BCD4", "#8BC34A", "#FF5722", "#673AB7",
+    "#FFC107", "#009688", "#F44336", "#3F51B5", "#CDDC39",
+]
+
+
+def render_html(nodes: list[dict], edges: list[dict]) -> str:
+    """Generate the vis.js HTML page for the graph.
+
+    *nodes* and *edges* are embedded as JSON literals inside a <script>
+    element; vis-network itself is loaded from unpkg.com.
+    """
+    # "</" inside a <script> element can terminate it early (for example a
+    # page titled "</script>"). "<\/" decodes to the same string in both JSON
+    # and JavaScript, so the data is unchanged.
+    nodes_json = json.dumps(nodes, indent=2).replace("</", "<\\/")
+    edges_json = json.dumps(edges, indent=2).replace("</", "<\\/")
+
+    legend_items = "".join(
+        f'<span style="background:{color};padding:3px 8px;margin:2px;border-radius:3px;font-size:12px">{t}</span>'
+        for t, color in TYPE_COLORS.items() if t != "unknown"
+    )
+
+    return f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<title>LLM Wiki — Knowledge Graph</title>
+<script src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
+<style>
+  body {{ margin: 0; background: #1a1a2e; font-family: sans-serif; color: #eee; }}
+  #graph {{ width: 100vw; height: 100vh; }}
+  #controls {{
+    position: fixed; top: 10px; left: 10px; background: rgba(0,0,0,0.7);
+    padding: 12px; border-radius: 8px; z-index: 10; max-width: 260px;
+  }}
+  #controls h3 {{ margin: 0 0 8px; font-size: 14px; }}
+  #search {{ width: 100%; padding: 4px; margin-bottom: 8px; background: #333; color: #eee; border: 1px solid #555; border-radius: 4px; }}
+  #info {{
+    position: fixed; bottom: 10px; left: 10px; background: rgba(0,0,0,0.8);
+    padding: 12px; border-radius: 8px; z-index: 10; max-width: 320px;
+    display: none;
+  }}
+  #stats {{ position: fixed; top: 10px; right: 10px; background: rgba(0,0,0,0.7); padding: 10px; border-radius: 8px; font-size: 12px; }}
+</style>
+</head>
+<body>
+<div id="controls">
+  <h3>LLM Wiki Graph</h3>
+  <input id="search" type="text" placeholder="Search nodes..." oninput="searchNodes(this.value)">
+  <div>{legend_items}</div>
+  <div style="margin-top:8px;font-size:11px;color:#aaa">
+    <span style="background:#555;padding:2px 6px;border-radius:3px;margin-right:4px">──</span> Explicit link<br>
+    <span style="background:#FF5722;padding:2px 6px;border-radius:3px;margin-right:4px">──</span> Inferred
+  </div>
+</div>
+<div id="graph"></div>
+<div id="info">
+  <b id="info-title"></b><br>
+  <span id="info-type" style="font-size:12px;color:#aaa"></span><br>
+  <span id="info-path" style="font-size:11px;color:#666"></span>
+</div>
+<div id="stats"></div>
+<script>
+const nodes = new vis.DataSet({nodes_json});
+const edges = new vis.DataSet({edges_json});
+
+const container = document.getElementById("graph");
+const network = new vis.Network(container, {{ nodes, edges }}, {{
+  nodes: {{
+    shape: "dot",
+    size: 12,
+    font: {{ color: "#eee", size: 13 }},
+    borderWidth: 2,
+  }},
+  edges: {{
+    width: 1.2,
+    smooth: {{ type: "continuous" }},
+    arrows: {{ to: {{ enabled: true, scaleFactor: 0.5 }} }},
+  }},
+  physics: {{
+    stabilization: {{ iterations: 150 }},
+    barnesHut: {{ gravitationalConstant: -8000, springLength: 120 }},
+  }},
+  interaction: {{ hover: true, tooltipDelay: 200 }},
+}});
+
+network.on("click", params => {{
+  if (params.nodes.length > 0) {{
+    const node = nodes.get(params.nodes[0]);
+    document.getElementById("info").style.display = "block";
+    document.getElementById("info-title").textContent = node.label;
+    document.getElementById("info-type").textContent = node.type;
+    document.getElementById("info-path").textContent = node.path;
+  }} else {{
+    document.getElementById("info").style.display = "none";
+  }}
+}});
+
+document.getElementById("stats").textContent =
+  `${{nodes.length}} nodes · ${{edges.length}} edges`;
+
+function searchNodes(q) {{
+  const lower = q.toLowerCase();
+  nodes.forEach(n => {{
+    nodes.update({{ id: n.id, opacity: (!q || n.label.toLowerCase().includes(lower)) ? 1 : 0.15 }});
+  }});
+}}
+</script>
+</body>
+</html>"""
+
+
+def append_log(entry: str):
+    """Prepend *entry* to wiki/log.md so the newest entry sits at the top."""
+    target = WIKI_DIR / "log.md"
+    previous = read_file(target)
+    combined = entry.strip() + "\n\n" + previous
+    target.write_text(combined, encoding="utf-8")
+
+
+def build_graph(infer: bool = True, open_browser: bool = False):
+    """Rebuild graph/graph.json and graph/graph.html from the wiki pages.
+
+    Args:
+        infer: when True, run the LLM inference pass and save its cache.
+        open_browser: when True, open the generated HTML in the default browser.
+
+    Does nothing except print a notice when the wiki has no pages.
+    """
+    pages = all_wiki_pages()
+    today = date.today().isoformat()
+
+    if not pages:
+        print("Wiki is empty. Ingest some sources first.")
+        return
+
+    print(f"Building graph from {len(pages)} wiki pages...")
+    GRAPH_DIR.mkdir(parents=True, exist_ok=True)
+
+    cache = load_cache()
+
+    # Pass 1: extracted edges
+    print("  Pass 1: extracting wikilinks...")
+    nodes = build_nodes(pages)
+    edges = build_extracted_edges(pages)
+    print(f"  → {len(edges)} extracted edges")
+
+    # Pass 2: inferred edges
+    if infer:
+        print("  Pass 2: inferring semantic relationships...")
+        inferred = build_inferred_edges(pages, edges, cache)
+        edges.extend(inferred)
+        print(f"  → {len(inferred)} inferred edges")
+        save_cache(cache)
+
+    # Community detection; nodes without a community keep their type color.
+    print("  Running Louvain community detection...")
+    communities = detect_communities(nodes, edges)
+    for node in nodes:
+        comm_id = communities.get(node["id"], -1)
+        if comm_id >= 0:
+            node["color"] = COMMUNITY_COLORS[comm_id % len(COMMUNITY_COLORS)]
+        node["group"] = comm_id
+
+    # Save graph.json
+    graph_data = {"nodes": nodes, "edges": edges, "built": today}
+    GRAPH_JSON.write_text(json.dumps(graph_data, indent=2), encoding="utf-8")
+    print(f"  saved: graph/graph.json  ({len(nodes)} nodes, {len(edges)} edges)")
+
+    # Save graph.html. The page declares charset UTF-8 and contains non-ASCII
+    # characters, so it must not be written with the platform default encoding.
+    html = render_html(nodes, edges)
+    GRAPH_HTML.write_text(html, encoding="utf-8")
+    print("  saved: graph/graph.html")
+
+    # Report all three edge types so the breakdown adds up to the total.
+    by_type = {
+        t: sum(1 for e in edges if e["type"] == t)
+        for t in ("EXTRACTED", "INFERRED", "AMBIGUOUS")
+    }
+    append_log(
+        f"## [{today}] graph | Knowledge graph rebuilt\n\n"
+        f"{len(nodes)} nodes, {len(edges)} edges "
+        f"({by_type['EXTRACTED']} extracted, {by_type['INFERRED']} inferred, "
+        f"{by_type['AMBIGUOUS']} ambiguous)."
+    )
+
+    if open_browser:
+        # as_uri() yields a well-formed file:// URL on every platform.
+        webbrowser.open(GRAPH_HTML.resolve().as_uri())
+
+
+if __name__ == "__main__":
+    # Command-line entry point: map the two flags onto build_graph's keywords.
+    cli = argparse.ArgumentParser(description="Build LLM Wiki knowledge graph")
+    cli.add_argument("--no-infer", action="store_true", help="Skip semantic inference (faster)")
+    cli.add_argument("--open", action="store_true", help="Open graph.html in browser")
+    options = cli.parse_args()
+    run_inference = not options.no_infer
+    build_graph(infer=run_inference, open_browser=options.open)
--- a/tools/heal.py
+++ b/tools/heal.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+"""
+Graph Self-Healing Tool
+
+Automatically retrieves "Missing Entity Pages" from the wiki and generates 
+comprehensive definition pages for them using the LLM. 
+It resolves broken entity links by scanning existing contexts where the entity is referenced.
+
+Usage:
+    python tools/heal.py
+"""
+
+import os
+import sys
+from pathlib import Path
+
+try:
+    from litellm import completion
+except ImportError:
+    print("Error: litellm not installed. Run: pip install litellm")
+    sys.exit(1)
+
+# Ensure tools can be imported
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from tools.lint import find_missing_entities, all_wiki_pages
+
+# Paths derived from this script's location (<repo>/tools/).
+REPO_ROOT = Path(__file__).parent.parent
+WIKI_DIR = REPO_ROOT / "wiki"
+ENTITIES_DIR = WIKI_DIR / "entities"  # generated entity pages are written here
+
+def call_llm(prompt: str, max_tokens: int = 1500) -> str:
+    """Send *prompt* as one user message through litellm and return the reply text.
+
+    The model comes from the LLM_MODEL environment variable, defaulting to a
+    fast model. Provider credentials are read by litellm from its standard
+    environment variables (e.g. GEMINI_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY).
+    """
+    chosen_model = os.getenv("LLM_MODEL", "claude-3-5-haiku-latest")
+    user_message = {"role": "user", "content": prompt}
+    reply = completion(
+        model=chosen_model,
+        messages=[user_message],
+        max_tokens=max_tokens
+    )
+    first_choice = reply.choices[0]
+    return first_choice.message.content
+
+def search_sources(entity: str, pages: list[Path]) -> list[Path]:
+    """Find up to 15 pages outside entities/ and concepts/ that mention *entity*.
+
+    Matching is a case-insensitive substring search over the page text.
+    """
+    needle = entity.lower()
+    skipped_dirs = {"entities", "concepts"}
+    sources = []
+    for p in pages:
+        # Compare whole directory names below the wiki root. A substring test
+        # on the absolute path would exclude every page whenever an ancestor
+        # directory merely contains "entities" or "concepts".
+        try:
+            parent_parts = p.parent.relative_to(WIKI_DIR).parts
+        except ValueError:
+            parent_parts = p.parent.parts
+        if skipped_dirs.intersection(parent_parts):
+            continue
+        if needle in p.read_text(encoding="utf-8").lower():
+            sources.append(p)
+            if len(sources) == 15:
+                break  # enough context; skip reading the remaining pages
+    return sources
+
+def heal_missing_entities():
+    """Generate an entity page for every missing entity reported by the linter.
+
+    For each entity, excerpts from pages that mention it are passed to the
+    LLM, and the reply is written to wiki/entities/<name>.md. A failure for
+    one entity is reported and does not stop the others.
+    """
+    pages = all_wiki_pages()
+    missing_entities = find_missing_entities(pages)
+
+    if not missing_entities:
+        print("Graph is fully connected. No missing entities found!")
+        return
+
+    ENTITIES_DIR.mkdir(exist_ok=True, parents=True)
+    print(f"Found {len(missing_entities)} missing entity nodes. Commencing auto-heal...")
+
+    for entity in missing_entities:
+        print(f"Healing entity page for: {entity}")
+
+        # The name comes from wikilink text, so it may contain path separators
+        # or be just dots; never let it resolve outside ENTITIES_DIR.
+        safe_name = entity.replace("/", "-").replace("\\", "-").strip(". ")
+        if not safe_name:
+            print(f" [!] Skipping {entity!r}: not usable as a file name")
+            continue
+        out_path = ENTITIES_DIR / f"{safe_name}.md"
+        if out_path.exists():
+            print(f" [!] Skipping {entity}: {out_path.name} already exists")
+            continue
+
+        sources = search_sources(entity, pages)
+
+        # First 800 characters of each source page, under a heading per file.
+        context = "".join(
+            f"\n\n### {s.name}\n{s.read_text(encoding='utf-8')[:800]}" for s in sources
+        )
+
+        prompt = f"""You are filling a data gap in the Personal LLM Wiki. 
+Create an Entity definition page for "{entity}".
+
+Here is how the entity appears in the current sources:
+{context}
+
+Format:
+---
+title: "{entity}"
+type: entity
+tags: []
+sources: {[s.name for s in sources]}
+---
+
+# {entity}
+
+Write a comprehensive paragraph defining what `{entity}` means in the context of this wiki, its main significance, and any actions or associations related to it.
+"""
+        try:
+            result = call_llm(prompt)
+            out_path.write_text(result, encoding="utf-8")
+            print(f" -> Saved to {out_path.relative_to(REPO_ROOT)}")
+        except Exception as e:
+            # Deliberately broad: any failure for one entity is reported and skipped.
+            print(f" [!] Failed to generate {entity}: {e}")
+
+# Script entry point: takes no command-line arguments.
+if __name__ == "__main__":
+    heal_missing_entities()
--- a/tools/ingest.py
+++ b/tools/ingest.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+"""
+Ingest a source document into the LLM Wiki.
+
+Usage:
+    python tools/ingest.py <path-to-source>
+    python tools/ingest.py raw/articles/my-article.md
+
+The LLM reads the source, extracts knowledge, and updates the wiki:
+  - Creates wiki/sources/<slug>.md
+  - Updates wiki/index.md
+  - Updates wiki/overview.md (if warranted)
+  - Creates/updates entity and concept pages
+  - Appends to wiki/log.md
+  - Flags contradictions
+"""
+
+import os
+import sys
+import json
+import hashlib
+import re
+from pathlib import Path
+from datetime import date
+
+import os
+
+# Paths derived from this script's location (<repo>/tools/).
+REPO_ROOT = Path(__file__).parent.parent
+WIKI_DIR = REPO_ROOT / "wiki"
+LOG_FILE = WIKI_DIR / "log.md"            # newest entries are prepended
+INDEX_FILE = WIKI_DIR / "index.md"
+OVERVIEW_FILE = WIKI_DIR / "overview.md"
+SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"     # included verbatim in the ingest prompt
+
+
+def sha256(text: str) -> str:
+    """Return the first 16 hex characters of the SHA-256 digest of *text*."""
+    full_digest = hashlib.sha256(text.encode()).hexdigest()
+    short_digest = full_digest[:16]
+    return short_digest
+
+
+def read_file(path: Path) -> str:
+    """Read *path* as UTF-8 text; a missing path yields an empty string."""
+    if path.exists():
+        text = path.read_text(encoding="utf-8")
+    else:
+        text = ""
+    return text
+
+
+def call_llm(prompt: str, max_tokens: int = 8192) -> str:
+    """Send *prompt* as one user message through litellm and return the reply text.
+
+    The model comes from the LLM_MODEL environment variable. Exits with
+    status 1 if litellm is not installed.
+    """
+    try:
+        from litellm import completion
+    except ImportError:
+        print("Error: litellm not installed. Run: pip install litellm")
+        sys.exit(1)
+
+    chosen_model = os.getenv("LLM_MODEL", "claude-3-5-sonnet-latest")
+    user_message = {"role": "user", "content": prompt}
+    reply = completion(
+        model=chosen_model,
+        messages=[user_message],
+        max_tokens=max_tokens
+    )
+    first_choice = reply.choices[0]
+    return first_choice.message.content
+
+
+def write_file(path: Path, content: str):
+    """Write *content* to *path* as UTF-8, creating parent directories, and print the path."""
+    parent_dir = path.parent
+    parent_dir.mkdir(parents=True, exist_ok=True)
+    path.write_text(content, encoding="utf-8")
+    shown = path.relative_to(REPO_ROOT)
+    print(f"  wrote: {shown}")
+
+
+def build_wiki_context() -> str:
+    """Assemble the wiki snapshot included in the ingest prompt.
+
+    Contains the index, the overview, and the five most recently modified
+    source pages, each under a ``## <path>`` heading and separated by ``---``.
+    Returns "" when none of them exist.
+    """
+    parts = []
+    if INDEX_FILE.exists():
+        parts.append(f"## wiki/index.md\n{read_file(INDEX_FILE)}")
+    if OVERVIEW_FILE.exists():
+        parts.append(f"## wiki/overview.md\n{read_file(OVERVIEW_FILE)}")
+    # Include a few recent source pages for contradiction checking
+    sources_dir = WIKI_DIR / "sources"
+    if sources_dir.exists():
+        recent = sorted(sources_dir.glob("*.md"), key=lambda p: p.stat().st_mtime, reverse=True)[:5]
+        for p in recent:
+            # Pages are written as UTF-8; read them back the same way rather
+            # than with the platform default encoding.
+            parts.append(f"## {p.relative_to(REPO_ROOT)}\n{p.read_text(encoding='utf-8')}")
+    return "\n\n---\n\n".join(parts)
+
+
+def parse_json_from_response(text: str) -> dict:
+    """Extract and decode the JSON object contained in an LLM reply.
+
+    Leading/trailing markdown code fences are removed, then everything from
+    the first "{" to the last "}" is parsed.
+
+    Raises:
+        ValueError: no braces-delimited span was found.
+        json.JSONDecodeError: the span is not valid JSON.
+    """
+    cleaned = text.strip()
+    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
+    cleaned = re.sub(r"\s*```$", "", cleaned.strip())
+    found = re.search(r"\{[\s\S]*\}", cleaned)
+    if found is None:
+        raise ValueError("No JSON object found in response")
+    return json.loads(found.group())
+
+
+def update_index(new_entry: str, section: str = "Sources"):
+    """Insert *new_entry* directly under the ``## <section>`` heading of the index.
+
+    A skeleton index is created when the file is missing or empty. If the
+    heading does not exist, a new section is appended at the end.
+    """
+    content = read_file(INDEX_FILE)
+    if not content:
+        content = "# Wiki Index\n\n## Overview\n- [Overview](overview.md) — living synthesis\n\n## Sources\n\n## Entities\n\n## Concepts\n\n## Syntheses\n"
+    section_header = f"## {section}"
+
+    # Match the heading as a whole line and insert once. A substring test plus
+    # str.replace would touch every occurrence, and would silently drop the
+    # entry when the heading is the last line without a trailing newline.
+    lines = content.split("\n")
+    for position, line in enumerate(lines):
+        if line.rstrip() == section_header:
+            lines.insert(position + 1, new_entry)
+            content = "\n".join(lines)
+            break
+    else:
+        content += f"\n{section_header}\n{new_entry}\n"
+    write_file(INDEX_FILE, content)
+
+
+def append_log(entry: str):
+    """Prepend *entry* to the log file so the newest entry sits at the top."""
+    previous = read_file(LOG_FILE)
+    combined = entry.strip() + "\n\n" + previous
+    write_file(LOG_FILE, combined)
+
+
+def ingest(source_path: str):
+    """Ingest one source document into the wiki with a single LLM call.
+
+    The model returns a JSON object describing the source page, entity and
+    concept pages, an optional overview rewrite, an index entry and a log
+    entry. The response is validated in full before anything is written.
+
+    Exits with status 1 when the source file is missing, the response cannot
+    be parsed, a required field is absent, or a page path points outside wiki/.
+    """
+    import tempfile  # local import: only needed for the debug dump below
+
+    source = Path(source_path)
+    if not source.exists():
+        print(f"Error: file not found: {source_path}")
+        sys.exit(1)
+
+    source_content = source.read_text(encoding="utf-8")
+    source_hash = sha256(source_content)
+    today = date.today().isoformat()
+
+    print(f"\nIngesting: {source.name}  (hash: {source_hash})")
+
+    wiki_context = build_wiki_context()
+    schema = read_file(SCHEMA_FILE)
+
+    prompt = f"""You are maintaining an LLM Wiki. Process this source document and integrate its knowledge into the wiki.
+
+Schema and conventions:
+{schema}
+
+Current wiki state (index + recent pages):
+{wiki_context if wiki_context else "(wiki is empty — this is the first source)"}
+
+New source to ingest (file: {source.relative_to(REPO_ROOT) if source.is_relative_to(REPO_ROOT) else source.name}):
+=== SOURCE START ===
+{source_content}
+=== SOURCE END ===
+
+Today's date: {today}
+
+Return ONLY a valid JSON object with these fields (no markdown fences, no prose outside the JSON):
+{{
+  "title": "Human-readable title for this source",
+  "slug": "kebab-case-slug-for-filename",
+  "source_page": "full markdown content for wiki/sources/<slug>.md — use the source page format from the schema",
+  "index_entry": "- [Title](sources/slug.md) — one-line summary",
+  "overview_update": "full updated content for wiki/overview.md, or null if no update needed",
+  "entity_pages": [
+    {{"path": "entities/EntityName.md", "content": "full markdown content"}}
+  ],
+  "concept_pages": [
+    {{"path": "concepts/ConceptName.md", "content": "full markdown content"}}
+  ],
+  "contradictions": ["describe any contradiction with existing wiki content, or empty list"],
+  "log_entry": "## [{today}] ingest | <title>\\n\\nAdded source. Key claims: ..."
+}}
+"""
+
+    # Same variable and default that call_llm uses.
+    model_name = os.getenv("LLM_MODEL", "claude-3-5-sonnet-latest")
+    print(f"  calling API (model: {model_name})")
+    raw = call_llm(prompt, max_tokens=8192)
+    try:
+        data = parse_json_from_response(raw)
+    except (ValueError, json.JSONDecodeError) as e:
+        # The system temp directory works on platforms without /tmp.
+        debug_path = Path(tempfile.gettempdir()) / "ingest_debug.txt"
+        print(f"Error parsing API response: {e}")
+        print(f"Raw response saved to {debug_path}")
+        debug_path.write_text(raw, encoding="utf-8")
+        sys.exit(1)
+
+    # Validate everything up front so a bad response cannot leave the wiki
+    # half-updated.
+    required = ("title", "slug", "source_page", "index_entry", "log_entry")
+    missing = [key for key in required if not data.get(key)]
+    if missing:
+        print(f"Error: API response is missing required fields: {', '.join(missing)}")
+        sys.exit(1)
+
+    wiki_root = WIKI_DIR.resolve()
+
+    def wiki_target(relative: str) -> Path:
+        """Return WIKI_DIR / relative, refusing paths that escape the wiki directory."""
+        resolved = (wiki_root / relative).resolve()
+        if wiki_root not in resolved.parents:
+            raise ValueError(f"path escapes wiki directory: {relative!r}")
+        return WIKI_DIR / relative
+
+    try:
+        # Source page first, then entity pages, then concept pages.
+        writes = [(wiki_target(f"sources/{data['slug']}.md"), data["source_page"])]
+        extra_pages = (data.get("entity_pages") or []) + (data.get("concept_pages") or [])
+        for page in extra_pages:
+            writes.append((wiki_target(page["path"]), page["content"]))
+    except (ValueError, KeyError, TypeError) as e:
+        print(f"Error: invalid page in API response: {e}")
+        sys.exit(1)
+
+    for target, content in writes:
+        write_file(target, content)
+
+    # Update overview
+    if data.get("overview_update"):
+        write_file(OVERVIEW_FILE, data["overview_update"])
+
+    # Update index
+    update_index(data["index_entry"], section="Sources")
+
+    # Append log
+    append_log(data["log_entry"])
+
+    # Report contradictions
+    contradictions = data.get("contradictions") or []
+    if contradictions:
+        print("\n  ⚠️  Contradictions detected:")
+        for c in contradictions:
+            print(f"     - {c}")
+
+    print(f"\nDone. Ingested: {data['title']}")
+
+
+if __name__ == "__main__":
+    # Each argument may be a markdown file, a directory (searched recursively
+    # for *.md), or a glob pattern.
+    import glob
+
+    cli_args = sys.argv[1:]
+    if not cli_args:
+        print("Usage: python tools/ingest.py <path-to-source> [path2 ...] [dir1 ...]")
+        sys.exit(1)
+
+    paths_to_process = []
+    for arg in cli_args:
+        p = Path(arg)
+        if p.is_file() and p.suffix == ".md":
+            paths_to_process.append(p)
+        elif p.is_dir():
+            paths_to_process.extend(f for f in p.rglob("*.md") if f.is_file())
+        else:
+            for match in glob.glob(arg, recursive=True):
+                g_p = Path(match)
+                if g_p.is_file() and g_p.suffix == ".md":
+                    paths_to_process.append(g_p)
+
+    # Deduplicate by resolved path, keeping the first spelling of each file.
+    first_seen = {}
+    for p in paths_to_process:
+        first_seen.setdefault(p.resolve(), p)
+    unique_paths = list(first_seen.values())
+
+    if not unique_paths:
+        print("Error: no markdown files found to ingest.")
+        sys.exit(1)
+
+    if len(unique_paths) > 1:
+        print(f"Batch mode: found {len(unique_paths)} files to ingest.")
+
+    for p in unique_paths:
+        ingest(str(p))
--- a/tools/lint.py
+++ b/tools/lint.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Lint the LLM Wiki for health issues.
+
+Usage:
+    python tools/lint.py
+    python tools/lint.py --save          # save lint report to wiki/lint-report.md
+
+Checks:
+  - Orphan pages (no inbound wikilinks from other pages)
+  - Broken wikilinks (pointing to pages that don't exist)
+  - Missing entity pages (entities mentioned in 3+ pages but no page)
+  - Contradictions between pages
+  - Data gaps and suggested new sources
+"""
+
+import re
+import sys
+import argparse
+from pathlib import Path
+from collections import defaultdict
+from datetime import date
+
+import os
+
+# Paths derived from this script's location (<repo>/tools/).
+REPO_ROOT = Path(__file__).parent.parent
+WIKI_DIR = REPO_ROOT / "wiki"
+LOG_FILE = WIKI_DIR / "log.md"  # newest entries are prepended by append_log
+SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"
+
+
+def read_file(path: Path) -> str:
+    """Return the UTF-8 contents of *path*; a non-existent path gives ""."""
+    return "" if not path.exists() else path.read_text(encoding="utf-8")
+
+
+def call_llm(prompt: str, model_env: str, default_model: str, max_tokens: int = 4096) -> str:
+    """Send *prompt* as one user message through litellm and return the reply text.
+
+    The model name is read from the environment variable named by *model_env*,
+    falling back to *default_model*. Exits with status 1 if litellm is missing.
+    """
+    try:
+        from litellm import completion
+    except ImportError:
+        print("Error: litellm not installed. Run: pip install litellm")
+        sys.exit(1)
+
+    chosen_model = os.getenv(model_env, default_model)
+    user_message = {"role": "user", "content": prompt}
+    reply = completion(
+        model=chosen_model,
+        messages=[user_message],
+        max_tokens=max_tokens
+    )
+    first_choice = reply.choices[0]
+    return first_choice.message.content
+
+
+def all_wiki_pages() -> list[Path]:
+    """Return every markdown file under WIKI_DIR except index, log and lint report."""
+    skipped_names = ("index.md", "log.md", "lint-report.md")
+    found = []
+    for candidate in WIKI_DIR.rglob("*.md"):
+        if candidate.name not in skipped_names:
+            found.append(candidate)
+    return found
+
+
+def extract_wikilinks(content: str) -> list[str]:
+    """Return every [[wikilink]] target in *content*, in order, duplicates included."""
+    matches = re.finditer(r'\[\[([^\]]+)\]\]', content)
+    return [match.group(1) for match in matches]
+
+
+def page_name_to_path(name: str) -> list[Path]:
+    """Return every wiki page whose file stem equals *name*, ignoring case."""
+    wanted = name.lower()
+    # An exact-case match is also a case-insensitive match, so one test suffices.
+    return [page for page in all_wiki_pages() if page.stem.lower() == wanted]
+
+
+def find_orphans(pages: list[Path]) -> list[Path]:
+    """Return the pages that no other page links to.
+
+    A page linking to itself does not count as inbound. wiki/overview.md is
+    never reported.
+    """
+    # Resolving a link walks the wiki directory, so do it once per link text.
+    resolved_by_link: dict[str, list[Path]] = {}
+    inbound = defaultdict(int)
+    for p in pages:
+        for link in extract_wikilinks(read_file(p)):
+            if link not in resolved_by_link:
+                resolved_by_link[link] = page_name_to_path(link)
+            for target in resolved_by_link[link]:
+                if target != p:  # self-links are not inbound links from another page
+                    inbound[target] += 1
+    overview = WIKI_DIR / "overview.md"
+    return [p for p in pages if inbound.get(p, 0) == 0 and p != overview]
+
+
+def find_broken_links(pages: list[Path]) -> list[tuple[Path, str]]:
+    """Return (page, link) for every wikilink that resolves to no existing page."""
+    return [
+        (page, link)
+        for page in pages
+        for link in extract_wikilinks(read_file(page))
+        if not page_name_to_path(link)
+    ]
+
+
+def find_missing_entities(pages: list[Path]) -> list[str]:
+    """Find link targets mentioned in 3+ distinct pages that have no page of their own.
+
+    A target counts as existing when some page's file stem equals it, ignoring
+    case. Targets are counted by their exact link text.
+    """
+    mention_counts: dict[str, int] = defaultdict(int)
+    existing_pages = {p.stem.lower() for p in pages}
+    for p in pages:
+        links = extract_wikilinks(read_file(p))
+        # Count each target once per page, so one page repeating a link three
+        # times does not reach the threshold alone. dict.fromkeys keeps
+        # first-seen order, which keeps the result order deterministic.
+        for link in dict.fromkeys(links):
+            if link.lower() not in existing_pages:
+                mention_counts[link] += 1
+    return [name for name, count in mention_counts.items() if count >= 3]
+
+
+def run_lint():
+    """Run the structural checks and the LLM review, print the report, and return it.
+
+    Returns "" (after printing a notice) when the wiki has no pages.
+    """
+    pages = all_wiki_pages()
+    today = date.today().isoformat()
+
+    if len(pages) == 0:
+        print("Wiki is empty. Nothing to lint.")
+        return ""
+
+    print(f"Linting {len(pages)} wiki pages...")
+
+    # Deterministic checks
+    orphans = find_orphans(pages)
+    broken = find_broken_links(pages)
+    missing_entities = find_missing_entities(pages)
+
+    print(f"  orphans: {len(orphans)}")
+    print(f"  broken links: {len(broken)}")
+    print(f"  missing entity pages: {len(missing_entities)}")
+
+    # Context for the semantic checks: the first 20 pages, each cut to 1500
+    # characters, to stay within context limits.
+    sample = pages[:20]
+    pages_context = "".join(
+        f"\n\n### {p.relative_to(REPO_ROOT)}\n{read_file(p)[:1500]}" for p in sample
+    )
+
+    print("  running semantic lint via API...")
+    prompt = f"""You are linting an LLM Wiki. Review the pages below and identify:
+1. Contradictions between pages (claims that conflict)
+2. Stale content (summaries that newer sources have superseded)
+3. Data gaps (important questions the wiki can't answer — suggest specific sources to find)
+4. Concepts mentioned but lacking depth
+
+Wiki pages (sample of {len(sample)} pages):
+{pages_context}
+
+Return a markdown lint report with these sections:
+## Contradictions
+## Stale Content
+## Data Gaps & Suggested Sources
+## Concepts Needing More Depth
+
+Be specific — name the exact pages and claims involved.
+"""
+    semantic_report = call_llm(prompt, "LLM_MODEL", "claude-3-5-sonnet-latest", max_tokens=3000)
+
+    # Compose full report
+    report_lines = [
+        f"# Wiki Lint Report — {today}",
+        "",
+        f"Scanned {len(pages)} pages.",
+        "",
+        "## Structural Issues",
+        "",
+    ]
+
+    if orphans:
+        report_lines.append("### Orphan Pages (no inbound links)")
+        report_lines.extend(f"- `{p.relative_to(REPO_ROOT)}`" for p in orphans)
+        report_lines.append("")
+
+    if broken:
+        report_lines.append("### Broken Wikilinks")
+        report_lines.extend(
+            f"- `{page.relative_to(REPO_ROOT)}` links to `[[{link}]]` — not found"
+            for page, link in broken
+        )
+        report_lines.append("")
+
+    if missing_entities:
+        report_lines.append("### Missing Entity Pages (mentioned 3+ times but no page)")
+        report_lines.extend(f"- `[[{name}]]`" for name in missing_entities)
+        report_lines.append("")
+
+    if not (orphans or broken or missing_entities):
+        report_lines.extend(["No structural issues found.", ""])
+
+    report_lines.extend(["---", "", semantic_report])
+
+    report = "\n".join(report_lines)
+    print("\n" + report)
+    return report
+
+
+def append_log(entry: str):
+    """Prepend *entry* to the log file so the newest entry sits at the top."""
+    previous = read_file(LOG_FILE)
+    combined = entry.strip() + "\n\n" + previous
+    LOG_FILE.write_text(combined, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Lint the LLM Wiki")
+    parser.add_argument("--save", action="store_true", help="Save lint report to wiki/lint-report.md")
+    args = parser.parse_args()
+
+    report = run_lint()
+
+    # An empty report means the wiki had no pages: nothing was linted, so do
+    # not log a run (and do not try to write into a wiki/ that may not exist).
+    if report:
+        if args.save:
+            report_path = WIKI_DIR / "lint-report.md"
+            report_path.write_text(report, encoding="utf-8")
+            print(f"\nSaved: {report_path.relative_to(REPO_ROOT)}")
+            details = "See lint-report.md for details."
+        else:
+            # Only point at lint-report.md when this run actually wrote it.
+            details = "Report was printed only (run with --save to write lint-report.md)."
+
+        today = date.today().isoformat()
+        append_log(f"## [{today}] lint | Wiki health check\n\nRan lint. {details}")
--- a/tools/query.py
+++ b/tools/query.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""
+Query the LLM Wiki.
+
+Usage:
+    python tools/query.py "What are the main themes across all sources?"
+    python tools/query.py "How does ConceptA relate to ConceptB?" --save
+    python tools/query.py "Summarize everything about EntityName" --save syntheses/my-analysis.md
+
+Flags:
+    --save              Save the answer back into the wiki (prompts for filename)
+    --save <path>       Save to a specific wiki path
+"""
+
+import sys
+import re
+import json
+import argparse
+from pathlib import Path
+from datetime import date
+
+import os
+
+# Repository root: two directory levels above this script file.
+REPO_ROOT = Path(__file__).parent.parent
+# Wiki content directory and the files inside it that this tool reads and updates.
+WIKI_DIR = REPO_ROOT / "wiki"
+INDEX_FILE = WIKI_DIR / "index.md"  # page index; scanned for markdown links to candidate pages
+LOG_FILE = WIKI_DIR / "log.md"  # activity log; new entries are prepended
+# Schema document whose text is included verbatim in the synthesis prompt.
+SCHEMA_FILE = REPO_ROOT / "CLAUDE.md"
+
+
+def read_file(path: Path) -> str:
+    """Return the UTF-8 text stored at ``path``, or ``""`` when nothing exists there."""
+    if not path.exists():
+        return ""
+    return path.read_text(encoding="utf-8")
+
+
+def write_file(path: Path, content: str):
+    """Write ``content`` to ``path`` as UTF-8, creating parent directories as needed.
+
+    After writing, prints the saved location relative to ``REPO_ROOT``.
+    """
+    parent_dir = path.parent
+    parent_dir.mkdir(parents=True, exist_ok=True)
+    path.write_text(content, encoding="utf-8")
+    shown = path.relative_to(REPO_ROOT)
+    print(f"  saved: {shown}")
+
+
+def call_llm(prompt: str, model_env: str, default_model: str, max_tokens: int = 4096) -> str:
+    """Send ``prompt`` as a single user message through litellm and return the reply text.
+
+    Args:
+        prompt: Full text of the user message.
+        model_env: Name of the environment variable that may override the model.
+        default_model: Model used when that variable is unset or empty.
+        max_tokens: Upper bound on generated tokens, passed through to litellm.
+
+    Returns:
+        The content of the first choice, or ``""`` when the response carries
+        no text content.
+
+    Exits the process with status 1 when litellm is not installed.
+    """
+    # Imported lazily so the missing-dependency hint only appears when an LLM
+    # call is actually attempted.
+    try:
+        from litellm import completion
+    except ImportError:
+        print("Error: litellm not installed. Run: pip install litellm")
+        sys.exit(1)
+
+    # `or` (rather than getenv's default) also covers a variable that is set
+    # but empty, e.g. `LLM_MODEL=` in an env file.
+    model = os.getenv(model_env) or default_model
+    response = completion(
+        model=model,
+        messages=[{"role": "user", "content": prompt}],
+        max_tokens=max_tokens,
+    )
+    # NOTE(review): in OpenAI-style responses message.content may be None;
+    # normalise it so callers can rely on the declared `str` return type.
+    content = response.choices[0].message.content
+    return content if content is not None else ""
+
+
+def _title_matches_question(title: str, question_lower: str) -> bool:
+    """Return True when an index link title looks related to the lowercased question."""
+    title_lower = title.lower()
+    # 1. Space-separated titles: any word longer than 3 chars found in the question.
+    if any(word in question_lower for word in title_lower.split() if len(word) > 3):
+        return True
+    # 2. Whole-title substring match (covers short titles, e.g. 2-character CJK names).
+    if len(title_lower) >= 2 and title_lower in question_lower:
+        return True
+    # 3. CJK chunks: runs of 2+ non-ASCII characters that appear in the question.
+    return any(chunk in question_lower for chunk in re.findall(r'[^\x00-\x7F]{2,}', title_lower))
+
+
+def find_relevant_pages(question: str, index_content: str) -> list[Path]:
+    """Pick wiki pages linked from the index whose titles relate to the question.
+
+    Only markdown links of the form ``[title](href)`` are considered; ``href``
+    is resolved against ``WIKI_DIR``. Links that do not point at a regular
+    file (missing targets, directories) are ignored so every returned path
+    can be read as text.
+
+    Args:
+        question: The user's question.
+        index_content: Text of the wiki index.
+
+    Returns:
+        At most 12 unique paths in index order, with ``overview.md`` placed
+        first when it exists and was not already matched.
+    """
+    md_links = re.findall(r'\[([^\]]+)\]\(([^)]+)\)', index_content)
+    question_lower = question.lower()
+    relevant: list[Path] = []
+
+    for title, href in md_links:
+        if not _title_matches_question(title, question_lower):
+            continue
+        p = WIKI_DIR / href
+        # is_file() rather than exists(): a link to a directory would
+        # otherwise be returned and fail when its text is read.
+        if p.is_file() and p not in relevant:
+            relevant.append(p)
+
+    # Always include overview
+    overview = WIKI_DIR / "overview.md"
+    if overview.is_file() and overview not in relevant:
+        relevant.insert(0, overview)
+    return relevant[:12]  # cap to avoid context overflow
+
+
+def append_log(entry: str):
+    """Insert ``entry`` at the top of the wiki log.
+
+    The stripped entry is followed by a blank line and then the log's
+    existing contents, keeping the most recent record first.
+    """
+    earlier_entries = read_file(LOG_FILE)
+    new_log = entry.strip() + "\n\n" + earlier_entries
+    LOG_FILE.write_text(new_log, encoding="utf-8")
+
+
+def _select_pages_via_llm(question: str, index_content: str) -> list[Path]:
+    """Ask the fast model which indexed pages fit the question; return those that are real files."""
+    prompt = f"Given this wiki index:\n\n{index_content}\n\nWhich pages are most relevant to answering: \"{question}\"\n\nReturn ONLY a JSON array of relative file paths (as listed in the index), e.g. [\"sources/foo.md\", \"concepts/Bar.md\"]. Maximum 10 pages."
+    raw = call_llm(prompt, "LLM_MODEL_FAST", "claude-3-5-haiku-latest", max_tokens=512)
+    # Tolerate an empty reply and strip an optional ```json fence around the array.
+    raw = (raw or "").strip()
+    raw = re.sub(r"^```(?:json)?\s*", "", raw)
+    raw = re.sub(r"\s*```$", "", raw)
+    try:
+        paths = json.loads(raw)
+    except json.JSONDecodeError:
+        return []
+    # Anything other than a JSON array of strings is treated as "no selection".
+    if not isinstance(paths, list):
+        return []
+    selected: list[Path] = []
+    for item in paths:
+        if not isinstance(item, str):
+            continue
+        candidate = WIKI_DIR / item
+        if candidate.is_file() and candidate not in selected:
+            selected.append(candidate)
+    return selected
+
+
+def _add_index_entry(entry: str) -> bool:
+    """Insert ``entry`` right under the first "## Syntheses" heading line of the index; False if absent."""
+    index_content = read_file(INDEX_FILE)
+    # Anchored to a whole line so "### Syntheses" or inline mentions are not
+    # matched; a callable replacement keeps backslashes in `entry` literal.
+    updated, count = re.subn(
+        r"(?m)^(## Syntheses[ \t]*)$",
+        lambda m: f"{m.group(1)}\n{entry}",
+        index_content,
+        count=1,
+    )
+    if not count:
+        return False
+    INDEX_FILE.write_text(updated, encoding="utf-8")
+    return True
+
+
+def query(question: str, save_path: str | None = None):
+    """Answer ``question`` from the wiki and optionally save the answer as a synthesis page.
+
+    Steps: read the index, choose pages by keyword match (falling back to an
+    LLM selection when at most one page matched), send the page texts plus the
+    schema to the main model, print the answer, optionally save and index it,
+    then prepend a record to the log.
+
+    Args:
+        question: The question to answer.
+        save_path: ``None`` to skip saving; ``""`` to prompt for a slug (saved
+            under ``syntheses/``); otherwise a path relative to the wiki
+            directory.
+
+    Exits with status 1 when the index is missing or empty. Returns early,
+    without writing a log record, when the slug prompt is left blank.
+    """
+    today = date.today().isoformat()
+
+    # Step 1: Read index
+    index_content = read_file(INDEX_FILE)
+    if not index_content:
+        print("Wiki is empty. Ingest some sources first with: python tools/ingest.py <source>")
+        sys.exit(1)
+
+    # Step 2: Find relevant pages
+    relevant_pages = find_relevant_pages(question, index_content)
+
+    # If the keyword match found at most one page, ask the model to pick from the index.
+    if len(relevant_pages) <= 1:
+        print("  selecting relevant pages via API...")
+        selected = _select_pages_via_llm(question, index_content)
+        # Keep the keyword result when the model's selection is empty or unusable.
+        if selected:
+            relevant_pages = selected
+
+    # Step 3: Read relevant pages (regular files only, so read_text cannot hit a directory)
+    relevant_pages = [p for p in relevant_pages if p.is_file()]
+    pages_context = "".join(
+        f"\n\n### {p.relative_to(REPO_ROOT)}\n{p.read_text(encoding='utf-8')}"
+        for p in relevant_pages
+    )
+
+    # With no readable page, fall back to the index itself as context.
+    if not pages_context:
+        pages_context = f"\n\n### wiki/index.md\n{index_content}"
+
+    schema = read_file(SCHEMA_FILE)
+
+    # Step 4: Synthesize answer
+    print(f"  synthesizing answer from {len(relevant_pages)} pages...")
+    prompt = f"""You are querying an LLM Wiki to answer a question. Use the wiki pages below to synthesize a thorough answer. Cite sources using [[PageName]] wikilink syntax.
+
+Schema:
+{schema}
+
+Wiki pages:
+{pages_context}
+
+Question: {question}
+
+Write a well-structured markdown answer with headers, bullets, and [[wikilink]] citations. At the end, add a ## Sources section listing the pages you drew from.
+"""
+    answer = call_llm(prompt, "LLM_MODEL", "claude-3-5-sonnet-latest", max_tokens=4096)
+    print("\n" + "=" * 60)
+    print(answer)
+    print("=" * 60)
+
+    # Step 5: Optionally save answer
+    if save_path is not None:
+        if save_path == "":
+            # Prompt for filename
+            slug = input("\nSave as (slug, e.g. 'my-analysis'): ").strip()
+            if not slug:
+                print("Skipping save.")
+                return
+            save_path = f"syntheses/{slug}.md"
+
+        full_save_path = WIKI_DIR / save_path
+        # json.dumps emits a double-quoted, escaped string, so quotes or
+        # backslashes in the question cannot break the YAML frontmatter.
+        title = json.dumps(question[:80], ensure_ascii=False)
+        frontmatter = f"""---
+title: {title}
+type: synthesis
+tags: []
+sources: []
+last_updated: {today}
+---
+
+"""
+        write_file(full_save_path, frontmatter + answer)
+
+        # Update index. Square brackets and line breaks in the question would
+        # break the markdown link, so they are removed from the label.
+        label = re.sub(r"[\[\]]", "", " ".join(question.split()))[:60] or save_path
+        entry = f"- [{label}]({save_path}) — synthesis"
+        if _add_index_entry(entry):
+            print(f"  indexed: {save_path}")
+        else:
+            print(f"  warning: no '## Syntheses' section in index; {save_path} was not indexed")
+
+    # Append to log
+    append_log(f"## [{today}] query | {question[:80]}\n\nSynthesized answer from {len(relevant_pages)} pages." +
+               (f" Saved to {save_path}." if save_path else ""))
+
+
+if __name__ == "__main__":
+    # CLI entry point: one positional question plus an optional --save flag.
+    cli = argparse.ArgumentParser(description="Query the LLM Wiki")
+    cli.add_argument("question", help="Question to ask the wiki")
+    # --save given without a value yields "" (query() then prompts for a slug);
+    # omitting the flag yields None (the answer is not saved).
+    cli.add_argument(
+        "--save",
+        nargs="?",
+        const="",
+        default=None,
+        help="Save answer to wiki (optionally specify path)",
+    )
+    parsed = cli.parse_args()
+    query(parsed.question, parsed.save)