#!/usr/bin/env python """ OpenClaw Session Sync Script Scans local or remote (via SSH) agent session directories, parses JSONL files, and pushes structured JSON to the Django API. Usage: # Local session sync python sync_sessions.py --remote-url http://macmini:8000/api/sessions/bulk_upsert/ # Remote session sync via SSH (single source, run from macmini) python sync_sessions.py --sync-ssh ubuntu1 \ --sync-source-path /home/shenwei/.openclaw \ --remote-url http://192.168.3.45:8765/api/sessions/bulk_upsert/ # Remote session sync via SSH (all three nodes) python sync_sessions.py --sync-ssh macmini \ --sync-source-path /Users/weishen/.openclaw \ --remote-url http://192.168.3.45:8765/api/sessions/bulk_upsert/ # Cron job sync (existing) python sync_sessions.py --cron \ --remote-url http://macmini:8000/api/cron/bulk_upsert/ \ --cron-ssh macmini \ --cron-jobs-path /Users/weishen/openclaw/cron/jobs.json \ --cron-runs-path /Users/weishen/openclaw/cron/runs/ """ import argparse import json import os import subprocess import sys import urllib.error import urllib.request from pathlib import Path # ───────────────────────────────────────────────────────────────── # Configuration # ───────────────────────────────────────────────────────────────── SESSIONS_DIR_NAME = "sessions" STATE_FILE = ".sync_state" DELETED_SUFFIX = ".deleted." CRON_STATE_FILE = ".sync_cron_state" SESSION_STATE_FILE = ".sync_session_ssh_state" # ───────────────────────────────────────────────────────────────── # SSH Helper # ───────────────────────────────────────────────────────────────── def ssh_read_file(host, remote_path): """Read a remote file via SSH and return content as string.""" result = subprocess.run( ["ssh", host, f"cat {remote_path}"], capture_output=True, text=True, timeout=60, ) if result.returncode != 0: raise RuntimeError(f"SSH read failed for {host}:{remote_path}: {result.stderr}") return result.stdout def ssh_list_dir(host, remote_dir): """List only directories (basenames) in a remote directory via SSH.""" # Use find to list directories only (type -d), then strip the remote_dir prefix result = subprocess.run( ["ssh", host, f"find {remote_dir} -maxdepth 1 -type d -name '*' | sort"], capture_output=True, text=True, timeout=30, ) if result.returncode != 0: return [] dirs = [] for f in result.stdout.strip().split("\n"): f = f.strip() if not f or f == remote_dir: continue # Strip remote_dir prefix basename = f[len(remote_dir):].lstrip("/") dirs.append(basename) return dirs def ssh_list_files(host, remote_dir, pattern="*"): """List regular files matching pattern via SSH. Returns basenames only.""" result = subprocess.run( ["ssh", host, f"ls -p {remote_dir}/{pattern}"], capture_output=True, text=True, timeout=30, ) if result.returncode != 0: return [] files = [] for f in result.stdout.strip().split("\n"): f = f.strip() if not f: continue # `ls -p` appends / to directories; filter them out if f.endswith("/"): continue # ls returns full paths; strip the remote_dir prefix if f.startswith(remote_dir): f = f[len(remote_dir):].lstrip("/") files.append(f) return files def ssh_stat_mtime_batch(host, file_list, sessions_dir): """ Batch-get mtimes for multiple files via a single SSH call. Passes file list on stdin to avoid command-line length limits. Returns {remote_full_path: mtime}. """ if not file_list: return {} # Build a newline-separated list of files to stat # Escape single quotes in paths by replacing ' -> '\'' (shell quoting) safe_list = "\n".join(f"{sessions_dir}/{fn}" for fn in file_list) # Remote Python one-liner reads paths from stdin, stats each, prints "path:mtime\n" remote_cmd = ( "python3 -c \"" "import sys,os; [print(f'{p}:{os.path.getmtime(p)}') for p in sys.stdin.read().splitlines()]" "\"" ) result = subprocess.run( ["ssh", host, remote_cmd], input=safe_list, capture_output=True, text=True, timeout=60, ) mtimes = {} for line in result.stdout.strip().split("\n"): line = line.strip() if not line or ":" not in line: continue path, mtime_str = line.rsplit(":", 1) try: mtimes[path] = int(float(mtime_str)) except ValueError: continue return mtimes # ───────────────────────────────────────────────────────────────── # JSONL Parser (Session mode) # ───────────────────────────────────────────────────────────────── def parse_jsonl_content(content): """ Parse JSONL content string and return (sessions, messages, tool_calls). This is the core parsing logic shared by both local and SSH modes. """ sessions = [] messages = [] tool_calls = [] current_model_provider = "" current_model_id = "" current_thinking_level = "" tool_results = {} events = [] for line in content.strip().split("\n"): line = line.strip() if not line: continue try: event = json.loads(line) events.append(event) except json.JSONDecodeError: continue if not events: return sessions, messages, tool_calls session_event = None for event in events: event_type = event.get("type", "") if event_type == "session": session_event = event break if not session_event: return sessions, messages, tool_calls session_id = session_event.get("id", "") session_timestamp = session_event.get("timestamp", "") session_cwd = session_event.get("cwd", "") session_version = events[-1].get("version", 0) if events else 0 timestamps = [] for event in events: ts = event.get("timestamp", "") if ts: timestamps.append(ts) message_seq = 0 total_tokens = 0 total_cost = 0.0 message_count = 0 tool_call_count = 0 error_count = 0 for event in events: event_type = event.get("type", "") if event_type == "model_change": current_model_provider = event.get("provider", "") current_model_id = event.get("modelId", "") elif event_type == "thinking_level_change": current_thinking_level = event.get("thinkingLevel", "") elif event_type == "message": message_obj = event.get("message", {}) role = message_obj.get("role", "") msg_id = event.get("id", "") parent_id = event.get("parentId", "") msg_timestamp = event.get("timestamp", "") content_items = message_obj.get("content", []) text_parts = [] tc_list = [] for item in content_items: if isinstance(item, dict): if item.get("type") == "text": text_parts.append(item.get("text", "")) elif item.get("type") == "toolCall": tc_list.append(item) content_text = "\n".join(text_parts) msg_data = { "session_id": session_id, "message_id": msg_id, "parent_id": parent_id or "", "seq": message_seq, "role": role or "", "content_text": content_text, "raw_content": content_items if content_items else [], "raw_message": message_obj.get("content", []), "timestamp": msg_timestamp, } if role == "assistant": usage = message_obj.get("usage", {}) msg_data.update({ "model": current_model_id, "provider": current_model_provider, "stop_reason": message_obj.get("stopReason", ""), "tokens_input": usage.get("inputTokens", 0), "tokens_output": usage.get("outputTokens", 0), "tokens_cache_read": usage.get("cacheReadInputTokens", 0), "tokens_cache_write": usage.get("cacheWriteInputTokens", 0), "tokens_total": usage.get("totalTokens", 0), }) total_tokens += usage.get("totalTokens", 0) if message_obj.get("cost"): cost_val = message_obj["cost"].get("total", 0.0) msg_data["cost_total"] = cost_val total_cost += cost_val message_count += 1 elif role == "toolResult": msg_data.update({ "tool_call_id": message_obj.get("toolCallId", ""), "tool_name": message_obj.get("toolName", ""), "is_error": message_obj.get("isError", False), "exit_code": message_obj.get("exitCode"), "duration_ms": message_obj.get("durationMs"), }) if message_obj.get("isError"): error_count += 1 if message_obj.get("toolCallId"): tool_results[message_obj["toolCallId"]] = { "result_text": content_text, "is_error": message_obj.get("isError", False), "exit_code": message_obj.get("exitCode"), "duration_ms": message_obj.get("durationMs"), } messages.append(msg_data) message_seq += 1 tc_seq = 0 for tc in tc_list: tool_call_data = { "session_id": session_id, "message_id": msg_id, "tool_call_id": tc.get("id", f"call_{msg_id}_{tc_seq}"), "tool_name": tc.get("name", "unknown"), "arguments": tc.get("arguments", {}), "seq": tc_seq, } tr = tool_results.get(tool_call_data["tool_call_id"], {}) tool_call_data["result_text"] = tr.get("result_text", "") tool_call_data["is_error"] = tr.get("is_error", False) tool_call_data["exit_code"] = tr.get("exit_code") tool_call_data["duration_ms"] = tr.get("duration_ms") tool_calls.append(tool_call_data) tool_call_count += 1 tc_seq += 1 start_time = timestamps[0] if timestamps else session_timestamp end_time = timestamps[-1] if timestamps else session_timestamp session_data = { "session_id": session_id, "session_version": session_version, "model_provider": current_model_provider, "model_id": current_model_id, "thinking_level": current_thinking_level, "start_time": start_time, "end_time": end_time, "cwd": session_cwd, "total_tokens": total_tokens, "total_cost": total_cost, "message_count": message_count, "tool_call_count": tool_call_count, "error_count": error_count, "raw_file_path": "", "status": "active", "metadata": {}, } sessions.append(session_data) return sessions, messages, tool_calls def parse_jsonl(file_path): """Parse a local JSONL file and return structured session data.""" with open(file_path, "r", encoding="utf-8") as f: content = f.read() sessions, messages, tool_calls = parse_jsonl_content(content) # Fill raw_file_path for local files for s in sessions: s["raw_file_path"] = str(file_path) return sessions, messages, tool_calls # ───────────────────────────────────────────────────────────────── # File Discovery (local) # ───────────────────────────────────────────────────────────────── def find_sessions(root_path): """Walk root_path/agents/*/sessions/ and yield (agent_name, jsonl_path).""" agents_dir = Path(root_path) / "agents" if not agents_dir.exists(): return for agent_folder in sorted(agents_dir.iterdir()): if not agent_folder.is_dir(): continue sessions_dir = agent_folder / SESSIONS_DIR_NAME if not sessions_dir.exists(): continue agent_name = agent_folder.name for jsonl_file in sorted(sessions_dir.glob("*.jsonl")): if DELETED_SUFFIX in jsonl_file.name: continue yield agent_name, str(jsonl_file) def get_sync_state(sessions_dir): """Read .sync_state from sessions directory, return {path: mtime}.""" state_path = Path(sessions_dir) / STATE_FILE if not state_path.exists(): return {} try: with open(state_path) as f: return json.load(f) except (json.JSONDecodeError, IOError): return {} def save_sync_state(sessions_dir, state): """Write .sync_state file.""" state_path = Path(sessions_dir) / STATE_FILE with open(state_path, "w") as f: json.dump(state, f) def get_new_files(root_path): """Find files that are new or modified since last sync.""" state = {} all_sessions_dirs = set() agents_dir = Path(root_path) / "agents" if agents_dir.exists(): for agent_folder in agents_dir.iterdir(): if agent_folder.is_dir(): sessions_dir = agent_folder / SESSIONS_DIR_NAME if sessions_dir.exists(): all_sessions_dirs.add(str(sessions_dir)) # Load existing state from all session dirs merged_state = {} for sd in all_sessions_dirs: sd_state = get_sync_state(sd) merged_state.update(sd_state) new_files = [] for agent_name, jsonl_path in find_sessions(root_path): stat = os.stat(jsonl_path) mtime = stat.st_mtime file_key = jsonl_path old_mtime = merged_state.get(file_key, 0) if mtime > old_mtime: new_files.append((agent_name, jsonl_path)) merged_state[file_key] = mtime # Save new state for sd in all_sessions_dirs: dir_files = {k: v for k, v in merged_state.items() if k.startswith(sd)} save_sync_state(sd, dir_files) return new_files # ───────────────────────────────────────────────────────────────── # Session Sync via SSH # ───────────────────────────────────────────────────────────────── def get_ssh_sync_state(state_file_path): """Read SSH session sync state, return {file_key: mtime}.""" p = Path(state_file_path) if not p.exists(): return {} try: with open(p) as f: return json.load(f) except (json.JSONDecodeError, IOError): return {} def save_ssh_sync_state(state_file_path, state): """Write SSH session sync state.""" p = Path(state_file_path) with open(p, "w") as f: json.dump(state, f) def sync_sessions_via_ssh(args): """Sync sessions from a remote host via SSH.""" ssh_host = args.sync_ssh source_path = args.sync_source_path.rstrip("/") print(f"[SSH Session Sync] host={ssh_host}, source={source_path}") # Enumerate all agents on remote agents_dir = f"{source_path}/agents" print(f" Scanning {ssh_host}:{agents_dir}/ ...") agent_folders = ssh_list_dir(ssh_host, agents_dir) # Load sync state state_file = Path.home() / SESSION_STATE_FILE prev_state = get_ssh_sync_state(str(state_file)) new_state = {} new_files = [] # (agent_name, remote_file_path) for agent_folder in sorted(agent_folders): sessions_dir = f"{agents_dir}/{agent_folder}/{SESSIONS_DIR_NAME}" # List .jsonl files, excluding deleted raw_files = ssh_list_files(ssh_host, sessions_dir, "*.jsonl") jsonl_files = [ f for f in raw_files if DELETED_SUFFIX not in f and f.endswith(".jsonl") ] if not jsonl_files: continue # Batch-fetch mtimes for all files in this agent's sessions dir all_mtimes = ssh_stat_mtime_batch(ssh_host, sorted(jsonl_files), sessions_dir) for jsonl_file in sorted(jsonl_files): remote_full = f"{sessions_dir}/{jsonl_file}" mtime = all_mtimes.get(remote_full) if mtime is None: continue old_mtime = prev_state.get(remote_full, 0) if mtime > old_mtime: new_files.append((agent_folder, remote_full)) new_state[remote_full] = mtime if not new_files: print(" No new or modified session files found.") save_ssh_sync_state(str(state_file), new_state) return print(f" Found {len(new_files)} new/modified session file(s).") # Save new state save_ssh_sync_state(str(state_file), new_state) # Group by agent_name agent_batches = {} for agent_name, remote_path in new_files: agent_batches.setdefault(agent_name, []).append(remote_path) total_sessions = 0 total_messages = 0 total_tool_calls = 0 for agent_name, remote_paths in agent_batches.items(): all_sessions = [] all_messages = [] all_tool_calls = [] for remote_path in remote_paths: print(f" Parsing: {remote_path}") try: content = ssh_read_file(ssh_host, remote_path) sessions, messages, tool_calls = parse_jsonl_content(content) # Fill raw_file_path with remote path for s in sessions: s["raw_file_path"] = remote_path all_sessions.extend(sessions) all_messages.extend(messages) all_tool_calls.extend(tool_calls) except Exception as e: print(f" ERROR reading {remote_path}: {e}") continue if not all_sessions: continue payload = { "agent_name": agent_name, "source_node": ssh_host, "sessions": all_sessions, "messages": all_messages, "tool_calls": all_tool_calls, } print(f" Pushing {len(all_sessions)} session(s), " f"{len(all_messages)} message(s), " f"{len(all_tool_calls)} tool call(s) ...") try: result = push_to_api(args.remote_url, payload) print(f" OK: sessions_upserted={result.get('sessions_upserted', 0)}, " f"messages_upserted={result.get('messages_upserted', 0)}, " f"tool_calls_upserted={result.get('tool_calls_upserted', 0)}") total_sessions += result.get("sessions_upserted", 0) total_messages += result.get("messages_upserted", 0) total_tool_calls += result.get("tool_calls_upserted", 0) except Exception as e: print(f" FAILED to push {agent_name} sessions: {e}") print(f"\nSync complete: {total_sessions} sessions, " f"{total_messages} messages, {total_tool_calls} tool calls pushed.") # ───────────────────────────────────────────────────────────────── # Cron Sync Mode # ───────────────────────────────────────────────────────────────── def get_cron_state(state_file_path): """Read cron sync state, return {run_file: mtime}.""" p = Path(state_file_path) if not p.exists(): return {} try: with open(p) as f: return json.load(f) except (json.JSONDecodeError, IOError): return {} def save_cron_state(state_file_path, state): """Write cron sync state.""" p = Path(state_file_path) with open(p, "w") as f: json.dump(state, f) def sync_cron_jobs(args): """Sync cron jobs from openclaw cron data.""" ssh_host = args.cron_ssh jobs_path = args.cron_jobs_path runs_path = args.cron_runs_path.rstrip("/") print(f"[Cron Sync] host={ssh_host}, jobs={jobs_path}") try: jobs_raw = ssh_read_file(ssh_host, jobs_path) jobs_data = json.loads(jobs_raw) except Exception as e: print(f"ERROR reading jobs.json: {e}") return jobs = jobs_data.get("jobs", []) job_ids = {j["id"] for j in jobs} print(f" Found {len(jobs)} jobs") # Find runs files, filter to only those matching known job IDs print(f"Scanning runs directory {ssh_host}:{runs_path}/ ...") all_run_files = ssh_list_files(ssh_host, runs_path, "*.jsonl") run_files = [f for f in all_run_files if Path(f).stem in job_ids] print(f" Found {len(run_files)} run files matching known job IDs") # Load sync state state_file = Path.home() / CRON_STATE_FILE prev_state = get_cron_state(str(state_file)) new_runs = [] new_state = {} # Batch-fetch mtimes for all run files all_run_mtimes = ssh_stat_mtime_batch(ssh_host, [Path(f).name for f in run_files], runs_path) for run_file in run_files: remote_full = f"{runs_path}/{Path(run_file).name}" mtime = all_run_mtimes.get(remote_full) if mtime is None: continue old_mtime = prev_state.get(remote_full, 0) if mtime > old_mtime: new_runs.append(remote_full) new_state[remote_full] = mtime if not new_runs: print(" No new or modified run files found.") save_cron_state(str(state_file), new_state) return print(f" Parsing {len(new_runs)} new/modified run file(s) ...") all_runs = [] for run_file in new_runs: print(f" Parsing: {run_file}") try: raw = ssh_read_file(ssh_host, run_file) for line in raw.strip().split("\n"): line = line.strip() if not line: continue try: run_obj = json.loads(line) all_runs.append(run_obj) except json.JSONDecodeError: continue except Exception as e: print(f" ERROR reading {run_file}: {e}") continue if not all_runs: print(" No run records parsed.") save_cron_state(str(state_file), new_state) return # Save new state save_cron_state(str(state_file), new_state) payload = { "source_node": os.environ.get("SOURCE_NODE", ssh_host), "jobs": jobs, "runs": all_runs, } print(f" Pushing {len(jobs)} jobs and {len(all_runs)} runs to {args.remote_url} ...") try: result = push_to_api(args.remote_url, payload) print(f" OK: jobs_upserted={result.get('jobs_upserted', 0)}, " f"runs_upserted={result.get('runs_upserted', 0)}") except Exception as e: print(f" FAILED to push cron data: {e}") # ───────────────────────────────────────────────────────────────── # HTTP Client # ───────────────────────────────────────────────────────────────── def push_to_api(remote_url, payload): """POST structured JSON to Django API.""" data = json.dumps(payload).encode("utf-8") req = urllib.request.Request( remote_url, data=data, headers={"Content-Type": "application/json"}, method="POST", ) try: with urllib.request.urlopen(req, timeout=120) as resp: return json.loads(resp.read()) except urllib.error.HTTPError as e: print(f"HTTP Error {e.code}: {e.read().decode('utf-8', errors='replace')}") raise except urllib.error.URLError as e: print(f"URL Error: {e.reason}") raise except Exception as e: print(f"Error: {e}") raise # ───────────────────────────────────────────────────────────────── # Main # ───────────────────────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser( description="Sync OpenClaw sessions or cron data to Django API" ) parser.add_argument( "--remote-url", required=True, help="Django API bulk_upsert endpoint URL", ) parser.add_argument( "--root-path", default=".", help="Local root path containing agents/ directory (default: current dir)", ) parser.add_argument( "--sync-ssh", metavar="HOST", help="Sync sessions from a remote host via SSH (e.g. ubuntu1, macmini)", ) parser.add_argument( "--sync-source-path", default="/home/shenwei/.openclaw", help="Remote source path for --sync-ssh (default: /home/shenwei/.openclaw)", ) parser.add_argument( "--cron", action="store_true", help="Sync cron jobs and runs instead of session files", ) parser.add_argument( "--cron-ssh", default="macmini", help="SSH host for cron data (default: macmini)", ) parser.add_argument( "--cron-jobs-path", default="/Users/weishen/openclaw/cron/jobs.json", help="Remote path to jobs.json", ) parser.add_argument( "--cron-runs-path", default="/Users/weishen/openclaw/cron/runs/", help="Remote directory containing run JSONL files", ) args = parser.parse_args() # SSH session sync mode takes priority if args.sync_ssh: sync_sessions_via_ssh(args) return if args.cron: sync_cron_jobs(args) return # Local session sync mode new_files = get_new_files(args.root_path) if not new_files: print("No new or modified session files found.") return print(f"Found {len(new_files)} new/modified session(s).") total_sessions = 0 total_messages = 0 total_tool_calls = 0 agent_batches = {} for agent_name, jsonl_path in new_files: agent_batches.setdefault(agent_name, []).append(jsonl_path) for agent_name, file_paths in agent_batches.items(): all_sessions = [] all_messages = [] all_tool_calls = [] for fp in file_paths: print(f" Parsing: {fp}") try: sessions, messages, tool_calls = parse_jsonl(fp) all_sessions.extend(sessions) all_messages.extend(messages) all_tool_calls.extend(tool_calls) except Exception as e: print(f" ERROR parsing {fp}: {e}") continue if not all_sessions: continue payload = { "agent_name": agent_name, "source_node": os.environ.get("SOURCE_NODE", "unknown"), "sessions": all_sessions, "messages": all_messages, "tool_calls": all_tool_calls, } print(f" Pushing {len(all_sessions)} session(s), " f"{len(all_messages)} message(s), " f"{len(all_tool_calls)} tool call(s) ...") try: result = push_to_api(args.remote_url, payload) print(f" OK: sessions_upserted={result.get('sessions_upserted', 0)}, " f"messages_upserted={result.get('messages_upserted', 0)}, " f"tool_calls_upserted={result.get('tool_calls_upserted', 0)}") total_sessions += result.get("sessions_upserted", 0) total_messages += result.get("messages_upserted", 0) total_tool_calls += result.get("tool_calls_upserted", 0) except Exception: print(f" FAILED to push {agent_name} sessions.") print(f"\nSync complete: {total_sessions} sessions, " f"{total_messages} messages, {total_tool_calls} tool calls pushed.") if __name__ == "__main__": main()