598 lines
21 KiB
Python
Executable File
598 lines
21 KiB
Python
Executable File
#!/usr/bin/env python
|
|
"""
OpenClaw Session Sync Script

Scans local agent session directories, parses JSONL files,
and pushes structured JSON to the Django API.

Usage:
    # Session sync (existing)
    python sync_sessions.py --remote-url http://macmini:8000/api/sessions/bulk_upsert/

    # Cron job sync (new)
    python sync_sessions.py --cron \
        --remote-url http://macmini:8000/api/cron/bulk_upsert/ \
        --cron-ssh macmini \
        --cron-jobs-path /Users/weishen/openclaw/cron/jobs.json \
        --cron-runs-path /Users/weishen/openclaw/cron/runs/

Cron:
    0 2 * * * cd /path/to/scripts && python sync_sessions.py --remote-url <url>
    0 3 * * * cd /path/to/scripts && python sync_sessions.py --cron --remote-url <cron-url> ...
"""
|
|
|
|
import argparse
import json
import os
import shlex
import subprocess
import sys
import urllib.error
import urllib.request
from pathlib import Path
|
|
|
|
# ─────────────────────────────────────────────────────────────────
|
|
# Configuration
|
|
# ─────────────────────────────────────────────────────────────────
|
|
|
|
# Naming conventions shared by the sync routines in this file.

# Session files whose name contains this marker are skipped during discovery.
DELETED_SUFFIX = ".deleted."

# Name of the per-agent sub-directory that holds the *.jsonl session files.
SESSIONS_DIR_NAME = "sessions"

# Name of the JSON state file kept inside each sessions directory.
STATE_FILE = ".sync_state"

# Name of the state file used by cron-run sync.
CRON_STATE_FILE = ".sync_cron_state"
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────
|
|
# SSH Helper
|
|
# ─────────────────────────────────────────────────────────────────
|
|
|
|
def ssh_read_file(host, remote_path):
    """Read a remote file over SSH and return its content as a string.

    Runs ``cat`` on ``host`` through the local ``ssh`` client. The path is
    shell-quoted before it reaches the remote shell, so names containing
    spaces or shell metacharacters are read literally instead of being
    word-split or interpreted. As a consequence the remote shell does not
    expand ``~`` or variables inside ``remote_path``; pass an absolute path.

    Args:
        host: SSH destination (host name or alias).
        remote_path: Path of the file on the remote machine.

    Returns:
        The remote command's standard output, decoded as UTF-8 text.

    Raises:
        RuntimeError: If the remote command exits with a non-zero status.
        subprocess.TimeoutExpired: If the command does not finish in 60 s.
    """
    # ssh joins its arguments into one string for the remote shell, so the
    # path has to be quoted here; "--" keeps a leading "-" from being parsed
    # as an option by cat.
    remote_command = f"cat -- {shlex.quote(str(remote_path))}"
    result = subprocess.run(
        ["ssh", host, remote_command],
        capture_output=True,
        text=True,
        # Decode explicitly so the result does not depend on the locale the
        # script happens to run under (e.g. a minimal cron environment).
        encoding="utf-8",
        timeout=60,
    )
    if result.returncode != 0:
        raise RuntimeError(f"SSH read failed for {host}:{remote_path}: {result.stderr}")
    return result.stdout
|
|
|
|
|
|
def ssh_list_files(host, remote_dir, pattern="*.jsonl"):
    """List remote files in ``remote_dir`` matching ``pattern`` via SSH.

    Args:
        host: SSH destination (host name or alias).
        remote_dir: Directory on the remote machine. It is shell-quoted, so
            it is taken literally (no ``~`` or variable expansion remotely).
        pattern: Shell glob appended to the directory; expanded by the
            remote shell.

    Returns:
        The non-blank lines printed by the remote ``ls``, stripped of
        surrounding whitespace. An empty list when the command exits with a
        non-zero status (which is also what ``ls`` does when nothing matches).

    Raises:
        subprocess.TimeoutExpired: If the command does not finish in 30 s.
    """
    # Quote only the directory part: the pattern must stay unquoted so the
    # remote shell can still expand the glob.
    remote_command = f"ls {shlex.quote(str(remote_dir))}/{pattern}"
    result = subprocess.run(
        ["ssh", host, remote_command],
        capture_output=True,
        text=True,
        timeout=30,
    )
    if result.returncode != 0:
        return []
    return [name.strip() for name in result.stdout.strip().split("\n") if name.strip()]
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────
|
|
# File Discovery
|
|
# ─────────────────────────────────────────────────────────────────
|
|
|
|
def find_sessions(root_path):
    """Yield ``(agent_name, jsonl_path)`` for every session file under root_path.

    Looks at ``<root_path>/agents/<agent>/<SESSIONS_DIR_NAME>/*.jsonl``.
    Agent folders and session files are visited in sorted order, and files
    whose name contains ``DELETED_SUFFIX`` are left out. Nothing is yielded
    when the ``agents`` directory does not exist.

    Args:
        root_path: Directory that contains the ``agents`` folder.

    Yields:
        Tuples of the agent folder name and the session file path as a string.
    """
    agents_root = Path(root_path) / "agents"
    if agents_root.exists():
        agent_dirs = (entry for entry in sorted(agents_root.iterdir()) if entry.is_dir())
        for agent_dir in agent_dirs:
            session_dir = agent_dir / SESSIONS_DIR_NAME
            if session_dir.exists():
                for candidate in sorted(session_dir.glob("*.jsonl")):
                    if DELETED_SUFFIX not in candidate.name:
                        yield agent_dir.name, str(candidate)
|
|
|
|
|
|
def get_sync_state(sessions_dir):
    """Load the sync state stored in a sessions directory.

    Args:
        sessions_dir: Directory that holds the ``STATE_FILE`` state file.

    Returns:
        The decoded JSON object (expected to map file path to mtime). An
        empty dict when the file is missing, unreadable, not valid JSON, or
        does not contain a JSON object.
    """
    state_path = Path(sessions_dir) / STATE_FILE
    try:
        with open(state_path) as f:
            state = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers both
        # json.JSONDecodeError and undecodable bytes in a corrupted file.
        return {}
    # Callers use dict methods on the result, so anything else (a list, a
    # number, null) is treated like a corrupt state file.
    return state if isinstance(state, dict) else {}
|
|
|
|
|
|
def save_sync_state(sessions_dir, state):
    """Write the sync state file of a sessions directory.

    The state is written to a temporary sibling file first and then moved
    over the real file, so an interruption mid-write cannot leave a
    truncated state file behind.

    Args:
        sessions_dir: Directory that holds the ``STATE_FILE`` state file.
        state: JSON-serialisable mapping to store.

    Raises:
        OSError: If the file cannot be written or replaced.
        TypeError: If ``state`` is not JSON-serialisable.
    """
    state_path = Path(sessions_dir) / STATE_FILE
    tmp_path = state_path.with_name(state_path.name + ".tmp")
    try:
        with open(tmp_path, "w") as f:
            json.dump(state, f)
        # os.replace swaps the file in one step; the previous state stays
        # intact until the new one is completely written.
        os.replace(tmp_path, state_path)
    except BaseException:
        # Do not leave a half-written temporary file behind.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
|
|
|
|
|
|
def get_new_files(root_path):
    """Find session files that are new or modified since the last sync.

    A file counts as new/modified when its mtime is greater than the mtime
    recorded in the state file of its sessions directory. The recorded
    mtimes are updated and written back before this function returns.

    NOTE(review): the state is saved here, before the caller has uploaded
    anything, so a file whose upload fails afterwards is not reported again
    until it changes on disk.

    Args:
        root_path: Directory that contains the ``agents`` folder.

    Returns:
        List of ``(agent_name, jsonl_path)`` tuples, in the order produced
        by ``find_sessions``.
    """
    all_sessions_dirs = set()
    agents_dir = Path(root_path) / "agents"
    if agents_dir.exists():
        for agent_folder in agents_dir.iterdir():
            if not agent_folder.is_dir():
                continue
            sessions_dir = agent_folder / SESSIONS_DIR_NAME
            if sessions_dir.exists():
                all_sessions_dirs.add(str(sessions_dir))

    # Load existing state from all session dirs
    merged_state = {}
    for sd in all_sessions_dirs:
        merged_state.update(get_sync_state(sd))

    new_files = []
    for agent_name, jsonl_path in find_sessions(root_path):
        try:
            mtime = os.stat(jsonl_path).st_mtime
        except OSError:
            # The file disappeared (or became unreadable) between listing
            # and stat; skip it instead of aborting the whole sync.
            continue
        if mtime > merged_state.get(jsonl_path, 0):
            new_files.append((agent_name, jsonl_path))
        merged_state[jsonl_path] = mtime

    # Save new state, each directory getting only the entries for its files.
    for sd in all_sessions_dirs:
        # Match on "<dir><separator>" so a directory never claims entries of
        # another directory whose path merely starts with the same text.
        prefix = sd + os.sep
        dir_files = {k: v for k, v in merged_state.items() if k.startswith(prefix)}
        save_sync_state(sd, dir_files)

    return new_files
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────
|
|
# JSONL Parser (Session mode)
|
|
# ─────────────────────────────────────────────────────────────────
|
|
|
|
def parse_jsonl(file_path):
    """Parse a session JSONL file into session, message and tool-call records.

    The file is read line by line; blank lines, lines that are not valid
    JSON and lines that do not hold a JSON object are ignored. The first
    event of type ``"session"`` supplies the session id, timestamp and cwd;
    without one, three empty lists are returned.

    Events are then processed in file order:

    * ``model_change`` / ``thinking_level_change`` update the model and
      thinking level that are attached to later assistant messages and to
      the session record (last value wins).
    * ``message`` events produce one message record each, plus one
      tool-call record per ``toolCall`` content item.

    Tool results arrive in ``toolResult`` messages *after* the assistant
    message that issued the call, so results are attached to their tool
    calls in a final pass once every event has been seen.

    Args:
        file_path: Path of the JSONL file.

    Returns:
        Tuple ``(sessions, messages, tool_calls)`` of lists of dicts.
        ``sessions`` holds zero or one entry.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    sessions = []
    messages = []
    tool_calls = []

    events = _load_jsonl_events(file_path)
    session_event = next((e for e in events if e.get("type", "") == "session"), None)
    if not session_event:
        return sessions, messages, tool_calls

    session_id = session_event.get("id", "")
    session_timestamp = session_event.get("timestamp", "")
    session_cwd = session_event.get("cwd", "")
    # Prefer the version recorded on the session header; fall back to the
    # last event's value when the header has none.
    # NOTE(review): assumes the format version lives on the session event -
    # confirm against the session file format.
    session_version = session_event.get("version", events[-1].get("version", 0))

    timestamps = [e["timestamp"] for e in events if e.get("timestamp")]

    current_model_provider = ""
    current_model_id = ""
    current_thinking_level = ""
    tool_results = {}  # toolCallId -> result fields taken from toolResult messages

    total_tokens = 0
    total_cost = 0.0
    message_count = 0
    error_count = 0

    for event in events:
        event_type = event.get("type", "")

        if event_type == "model_change":
            current_model_provider = event.get("provider", "")
            current_model_id = event.get("modelId", "")

        elif event_type == "thinking_level_change":
            current_thinking_level = event.get("thinkingLevel", "")

        elif event_type == "message":
            message_obj = event.get("message") or {}
            role = message_obj.get("role", "")
            msg_id = event.get("id", "")

            content_items = message_obj.get("content", [])
            content_text, tc_list = _split_message_content(content_items)

            msg_data = {
                "session_id": session_id,
                "message_id": msg_id,
                "parent_id": event.get("parentId", "") or "",
                "seq": len(messages),
                "role": role or "",
                "content_text": content_text,
                "raw_content": content_items if content_items else [],
                "raw_message": message_obj.get("content", []),
                "timestamp": event.get("timestamp", ""),
            }

            if role == "assistant":
                usage = message_obj.get("usage") or {}
                msg_data.update({
                    "model": current_model_id,
                    "provider": current_model_provider,
                    "stop_reason": message_obj.get("stopReason", ""),
                    "tokens_input": usage.get("inputTokens", 0),
                    "tokens_output": usage.get("outputTokens", 0),
                    "tokens_cache_read": usage.get("cacheReadInputTokens", 0),
                    "tokens_cache_write": usage.get("cacheWriteInputTokens", 0),
                    "tokens_total": usage.get("totalTokens", 0),
                })
                # "or 0" keeps the running totals numeric when a field is
                # present but null.
                total_tokens += usage.get("totalTokens", 0) or 0

                cost = message_obj.get("cost")
                if isinstance(cost, dict) and cost:
                    cost_val = cost.get("total", 0.0)
                    msg_data["cost_total"] = cost_val
                    total_cost += cost_val or 0.0

                # NOTE(review): message_count only counts assistant
                # messages - confirm that is the intended meaning.
                message_count += 1

            elif role == "toolResult":
                result_fields = {
                    "is_error": message_obj.get("isError", False),
                    "exit_code": message_obj.get("exitCode"),
                    "duration_ms": message_obj.get("durationMs"),
                }
                msg_data.update({
                    "tool_call_id": message_obj.get("toolCallId", ""),
                    "tool_name": message_obj.get("toolName", ""),
                    **result_fields,
                })
                if message_obj.get("isError"):
                    error_count += 1
                if message_obj.get("toolCallId"):
                    tool_results[message_obj["toolCallId"]] = {
                        "result_text": content_text,
                        **result_fields,
                    }

            messages.append(msg_data)

            for tc_seq, tc in enumerate(tc_list):
                tool_calls.append({
                    "session_id": session_id,
                    "message_id": msg_id,
                    "tool_call_id": tc.get("id", f"call_{msg_id}_{tc_seq}"),
                    "tool_name": tc.get("name", "unknown"),
                    "arguments": tc.get("arguments", {}),
                    "seq": tc_seq,
                })

    # Attach results only now: a toolResult message follows the assistant
    # message that contains the matching toolCall, so looking results up
    # while scanning would always come back empty.
    for call in tool_calls:
        outcome = tool_results.get(call["tool_call_id"], {})
        call["result_text"] = outcome.get("result_text", "")
        call["is_error"] = outcome.get("is_error", False)
        call["exit_code"] = outcome.get("exit_code")
        call["duration_ms"] = outcome.get("duration_ms")

    sessions.append({
        "session_id": session_id,
        "session_version": session_version,
        "model_provider": current_model_provider,
        "model_id": current_model_id,
        "thinking_level": current_thinking_level,
        # First/last timestamp in file order; the session timestamp is used
        # when no event carries one.
        "start_time": timestamps[0] if timestamps else session_timestamp,
        "end_time": timestamps[-1] if timestamps else session_timestamp,
        "cwd": session_cwd,
        "total_tokens": total_tokens,
        "total_cost": total_cost,
        "message_count": message_count,
        "tool_call_count": len(tool_calls),
        "error_count": error_count,
        "raw_file_path": str(file_path),
        "status": "active",
        "metadata": {},
    })
    return sessions, messages, tool_calls


def _load_jsonl_events(file_path):
    """Return the JSON objects stored one per line in file_path."""
    events = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                event = json.loads(line)
            except json.JSONDecodeError:
                # Tolerate corrupt or partially written lines.
                continue
            # A valid JSON line that is not an object (number, list, ...)
            # cannot be an event.
            if isinstance(event, dict):
                events.append(event)
    return events


def _split_message_content(content):
    """Return ``(text, tool_call_items)`` extracted from a message's content."""
    if isinstance(content, str):
        # Plain-string content is the message text itself.
        return content, []
    if not isinstance(content, list):
        return "", []
    text_parts = []
    tool_call_items = []
    for item in content:
        if not isinstance(item, dict):
            continue
        item_type = item.get("type")
        if item_type == "text":
            text_parts.append(item.get("text") or "")
        elif item_type == "toolCall":
            tool_call_items.append(item)
    return "\n".join(text_parts), tool_call_items
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────
|
|
# Cron Sync Mode
|
|
# ─────────────────────────────────────────────────────────────────
|
|
|
|
def get_cron_state(state_file_path):
    """Load the cron sync state.

    Args:
        state_file_path: Path of the JSON state file.

    Returns:
        The decoded JSON object (expected to map run file to mtime). An
        empty dict when the file is missing, unreadable, not valid JSON, or
        does not contain a JSON object.
    """
    try:
        with open(Path(state_file_path)) as f:
            state = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers both
        # json.JSONDecodeError and undecodable bytes in a corrupted file.
        return {}
    # Callers use dict methods on the result, so anything else (a list, a
    # number, null) is treated like a corrupt state file.
    return state if isinstance(state, dict) else {}
|
|
|
|
|
|
def save_cron_state(state_file_path, state):
    """Write the cron sync state file.

    The state is written to a temporary sibling file first and then moved
    over the real file, so an interruption mid-write cannot leave a
    truncated state file behind.

    Args:
        state_file_path: Path of the JSON state file.
        state: JSON-serialisable mapping to store.

    Raises:
        OSError: If the file cannot be written or replaced.
        TypeError: If ``state`` is not JSON-serialisable.
    """
    p = Path(state_file_path)
    tmp_path = p.with_name(p.name + ".tmp")
    try:
        with open(tmp_path, "w") as f:
            json.dump(state, f)
        # os.replace swaps the file in one step; the previous state stays
        # intact until the new one is completely written.
        os.replace(tmp_path, p)
    except BaseException:
        # Do not leave a half-written temporary file behind.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
|
|
|
|
|
|
def sync_cron_jobs(args):
    """Sync cron jobs and their run logs from a remote host to the API.

    Steps:

    1. Read ``jobs.json`` over SSH.
    2. List ``*.jsonl`` files in the runs directory and keep those whose
       stem is a known job id.
    3. Compare each file's remote mtime with the mtime recorded in the
       state file in the home directory to find new or modified files.
    4. Read the changed files, parse one JSON object per line, and push all
       jobs together with the parsed runs to ``args.remote_url``.

    The state file is only advanced once the data has been pushed
    successfully, and a run file that could not be read keeps its previous
    state entry, so failed work is picked up again on the next run.

    NOTE(review): jobs are only pushed when at least one run file changed;
    a change to jobs.json alone is not uploaded. Confirm this is intended.

    Args:
        args: Parsed command-line namespace. Uses ``cron_ssh``,
            ``cron_jobs_path``, ``cron_runs_path`` and ``remote_url``.

    Returns:
        True when the sync finished without errors (including "nothing to
        do"), False when jobs.json could not be read, the remote platform
        could not be detected, a run file could not be read, or the push
        failed.
    """
    ssh_host = args.cron_ssh
    jobs_path = args.cron_jobs_path
    runs_path = args.cron_runs_path.rstrip("/")

    print(f"Fetching jobs.json from {ssh_host}:{jobs_path}...")
    try:
        jobs_raw = ssh_read_file(ssh_host, jobs_path)
        jobs_data = json.loads(jobs_raw)
    except Exception as e:
        print(f"ERROR reading jobs.json: {e}")
        return False

    jobs = jobs_data.get("jobs", []) if isinstance(jobs_data, dict) else []
    # Entries without an "id" cannot be matched to a run file; skip them
    # here instead of failing on a KeyError.
    job_ids = {j["id"] for j in jobs if isinstance(j, dict) and "id" in j}
    print(f" Found {len(jobs)} jobs")

    # Find runs files, filter to only those matching known job IDs
    print(f"Scanning runs directory {ssh_host}:{runs_path}/...")
    all_run_files = ssh_list_files(ssh_host, runs_path, "*.jsonl")
    run_files = [f for f in all_run_files if Path(f).stem in job_ids]
    print(f" Found {len(run_files)} run files matching known job IDs")

    # Load sync state
    state_file = Path.home() / CRON_STATE_FILE
    prev_state = get_cron_state(str(state_file))
    if not isinstance(prev_state, dict):
        prev_state = {}

    # Detect remote platform (Linux vs macOS) for stat syntax
    try:
        uname_result = subprocess.run(
            ["ssh", ssh_host, "uname"],
            capture_output=True, text=True, timeout=10,
        )
    except subprocess.TimeoutExpired:
        print(f"ERROR: timed out detecting platform on {ssh_host}")
        return False
    is_macos = uname_result.stdout.strip() == "Darwin"
    stat_cmd = "stat -f %m" if is_macos else "stat -c %Y"

    new_runs = []
    new_state = {}
    for run_file in run_files:
        remote_full = f"{runs_path}/{Path(run_file).name}"
        mtime = _remote_mtime(ssh_host, stat_cmd, remote_full)
        if mtime is None:
            # Could not stat the file: keep what was known about it so it is
            # not mistaken for a brand-new file later.
            if remote_full in prev_state:
                new_state[remote_full] = prev_state[remote_full]
            continue
        if mtime > prev_state.get(remote_full, 0):
            new_runs.append(remote_full)
        new_state[remote_full] = mtime

    if not new_runs:
        print("No new or modified run files found.")
        # An empty listing is also what an SSH failure looks like, so never
        # overwrite existing state with an empty one.
        if new_state:
            save_cron_state(str(state_file), new_state)
        return True

    print(f"Parsing {len(new_runs)} new/modified run file(s)...")

    all_runs = []
    read_failed = False
    for run_file in new_runs:
        print(f" Parsing: {run_file}")
        try:
            raw = ssh_read_file(ssh_host, run_file)
        except Exception as e:
            print(f" ERROR reading {run_file}: {e}")
            read_failed = True
            # Roll the entry back so the file is retried on the next run.
            if run_file in prev_state:
                new_state[run_file] = prev_state[run_file]
            else:
                new_state.pop(run_file, None)
            continue
        for line in raw.strip().split("\n"):
            line = line.strip()
            if not line:
                continue
            try:
                all_runs.append(json.loads(line))
            except json.JSONDecodeError:
                continue

    if not all_runs:
        print("No run records parsed.")
        if new_state:
            save_cron_state(str(state_file), new_state)
        return not read_failed

    payload = {
        "source_node": os.environ.get("SOURCE_NODE", ssh_host),
        "jobs": jobs,
        "runs": all_runs,
    }

    print(f"Pushing {len(jobs)} jobs and {len(all_runs)} runs to {args.remote_url}...")
    try:
        result = push_to_api(args.remote_url, payload)
    except Exception as e:
        # State is deliberately left untouched so these runs are pushed
        # again next time.
        print(f" FAILED to push cron data: {e}")
        return False

    # Save new state only now that the data has reached the API.
    save_cron_state(str(state_file), new_state)

    summary = result if isinstance(result, dict) else {}
    print(f" OK: jobs_upserted={summary.get('jobs_upserted', 0)}, "
          f"runs_upserted={summary.get('runs_upserted', 0)}")
    return not read_failed


def _remote_mtime(ssh_host, stat_cmd, remote_path):
    """Return the integer mtime of remote_path read via SSH, or None on failure."""
    try:
        result = subprocess.run(
            # Quote the path: ssh hands the command string to a remote shell.
            ["ssh", ssh_host, f"{stat_cmd} {shlex.quote(remote_path)}"],
            capture_output=True, text=True, timeout=10,
        )
    except subprocess.TimeoutExpired:
        return None
    if result.returncode != 0:
        return None
    try:
        return int(result.stdout.strip())
    except ValueError:
        return None
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────
|
|
# HTTP Client
|
|
# ─────────────────────────────────────────────────────────────────
|
|
|
|
def push_to_api(remote_url, payload):
    """Send ``payload`` as a JSON POST request and return the decoded reply.

    Args:
        remote_url: Endpoint URL.
        payload: JSON-serialisable object sent as the request body.

    Returns:
        The response body parsed as JSON.

    Raises:
        urllib.error.HTTPError: On an HTTP error status (the status code and
            response body are printed first).
        urllib.error.URLError: When the request cannot be made (the reason
            is printed first).
        Exception: Any other failure while sending the request or decoding
            the reply is printed and re-raised unchanged.
    """
    body = json.dumps(payload).encode("utf-8")
    request = urllib.request.Request(
        remote_url,
        data=body,
        method="POST",
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=120) as response:
            raw_reply = response.read()
            return json.loads(raw_reply)
    except Exception as err:
        # HTTPError is a subclass of URLError, so it has to be checked first.
        if isinstance(err, urllib.error.HTTPError):
            detail = err.read().decode("utf-8", errors="replace")
            print(f"HTTP Error {err.code}: {detail}")
        elif isinstance(err, urllib.error.URLError):
            print(f"URL Error: {err.reason}")
        else:
            print(f"Error: {err}")
        raise
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────
|
|
# Main
|
|
# ─────────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """Command-line entry point: sync session files or cron data to the API.

    With ``--cron`` the work is delegated to ``sync_cron_jobs``. Otherwise
    new or modified session files are collected with ``get_new_files``,
    grouped per agent, parsed, and pushed with one request per agent.

    NOTE(review): ``get_new_files`` records files as synced before they are
    uploaded here, so a file whose push fails is not picked up again until
    it changes on disk.

    Returns:
        Process exit status: 0 on success, 1 when a session file could not
        be parsed, a push failed, or ``sync_cron_jobs`` reported failure.
    """
    parser = argparse.ArgumentParser(description="Sync OpenClaw sessions or cron data to Django API")
    parser.add_argument(
        "--remote-url",
        required=True,
        help="Django API bulk_upsert endpoint URL",
    )
    parser.add_argument(
        "--root-path",
        default=".",
        help="Root path containing agents/ directory (default: current dir)",
    )
    parser.add_argument(
        "--cron",
        action="store_true",
        help="Sync cron jobs and runs instead of session files",
    )
    parser.add_argument(
        "--cron-ssh",
        default="macmini",
        help="SSH host for cron data (default: macmini)",
    )
    parser.add_argument(
        "--cron-jobs-path",
        default="/Users/weishen/openclaw/cron/jobs.json",
        help="Remote path to jobs.json",
    )
    parser.add_argument(
        "--cron-runs-path",
        default="/Users/weishen/openclaw/cron/runs/",
        help="Remote directory containing run JSONL files",
    )

    args = parser.parse_args()

    if args.cron:
        # sync_cron_jobs may return nothing at all; only an explicit False
        # is treated as a failure.
        return 1 if sync_cron_jobs(args) is False else 0

    # Original session sync mode
    new_files = get_new_files(args.root_path)
    if not new_files:
        print("No new or modified session files found.")
        return 0

    print(f"Found {len(new_files)} new/modified session(s).")

    total_sessions = 0
    total_messages = 0
    total_tool_calls = 0
    failures = 0

    # One request per agent: group the changed files by agent name.
    agent_batches = {}
    for agent_name, jsonl_path in new_files:
        agent_batches.setdefault(agent_name, []).append(jsonl_path)

    for agent_name, file_paths in agent_batches.items():
        all_sessions = []
        all_messages = []
        all_tool_calls = []

        for fp in file_paths:
            print(f" Parsing: {fp}")
            try:
                sessions, messages, tool_calls = parse_jsonl(fp)
            except Exception as e:
                print(f" ERROR parsing {fp}: {e}")
                failures += 1
                continue
            all_sessions.extend(sessions)
            all_messages.extend(messages)
            all_tool_calls.extend(tool_calls)

        if not all_sessions:
            continue

        payload = {
            "agent_name": agent_name,
            "source_node": os.environ.get("SOURCE_NODE", "unknown"),
            "sessions": all_sessions,
            "messages": all_messages,
            "tool_calls": all_tool_calls,
        }

        print(f" Pushing {len(all_sessions)} session(s), "
              f"{len(all_messages)} message(s), "
              f"{len(all_tool_calls)} tool call(s)...")

        try:
            result = push_to_api(args.remote_url, payload)
        except Exception:
            # push_to_api has already printed the details of the error.
            print(f" FAILED to push {agent_name} sessions.")
            failures += 1
            continue

        print(f" OK: {result}")
        # Only a JSON object carries the per-type counters; any other reply
        # still counts as a successful push.
        counts = result if isinstance(result, dict) else {}
        total_sessions += counts.get("sessions_upserted", 0)
        total_messages += counts.get("messages_upserted", 0)
        total_tool_calls += counts.get("tool_calls_upserted", 0)

    print(f"\nSync complete: {total_sessions} sessions, "
          f"{total_messages} messages, {total_tool_calls} tool calls pushed.")
    return 1 if failures else 0


if __name__ == "__main__":
    # Propagate the status so cron (or any caller) can detect a failed sync.
    sys.exit(main())
|