agent-base/scripts/sync_sessions.py

#!/usr/bin/env python
"""
OpenClaw Session Sync Script

Scans local agent sessions directories, parses JSONL files,
and pushes structured JSON to the Django API.

Usage:
    # Session sync (existing)
    python sync_sessions.py --remote-url http://macmini:8000/api/sessions/bulk_upsert/

    # Cron job sync (new)
    python sync_sessions.py --cron \
        --remote-url http://macmini:8000/api/cron/bulk_upsert/ \
        --cron-ssh macmini \
        --cron-jobs-path /Users/weishen/openclaw/cron/jobs.json \
        --cron-runs-path /Users/weishen/openclaw/cron/runs/

Cron:
    0 2 * * * cd /path/to/scripts && python sync_sessions.py --remote-url <url>
    0 3 * * * cd /path/to/scripts && python sync_sessions.py --cron --remote-url <cron-url> ...
"""

import argparse
import json
import os
import subprocess
import sys
import urllib.error
import urllib.request
from pathlib import Path

# ─────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────

SESSIONS_DIR_NAME = "sessions"
STATE_FILE = ".sync_state"
DELETED_SUFFIX = ".deleted."
CRON_STATE_FILE = ".sync_cron_state"


# ─────────────────────────────────────────────────────────────────
# SSH Helper
# ─────────────────────────────────────────────────────────────────

def ssh_read_file(host, remote_path):
    """Read a remote file via SSH and return content as string."""
    result = subprocess.run(
        ["ssh", host, f"cat {remote_path}"],
        capture_output=True,
        text=True,
        timeout=60,
    )
    if result.returncode != 0:
        raise RuntimeError(f"SSH read failed for {host}:{remote_path}: {result.stderr}")
    return result.stdout


def ssh_list_files(host, remote_dir, pattern="*.jsonl"):
    """List remote files matching pattern via SSH."""
    result = subprocess.run(
        ["ssh", host, f"ls {remote_dir}/{pattern}"],
        capture_output=True,
        text=True,
        timeout=30,
    )
    if result.returncode != 0:
        return []
    return [f.strip() for f in result.stdout.strip().split("\n") if f.strip()]


# ─────────────────────────────────────────────────────────────────
# File Discovery
# ─────────────────────────────────────────────────────────────────

def find_sessions(root_path):
    """Walk root_path/agents/*/sessions/ and yield (agent_name, jsonl_path)."""
    agents_dir = Path(root_path) / "agents"
    if not agents_dir.exists():
        return
    for agent_folder in sorted(agents_dir.iterdir()):
        if not agent_folder.is_dir():
            continue
        sessions_dir = agent_folder / SESSIONS_DIR_NAME
        if not sessions_dir.exists():
            continue
        agent_name = agent_folder.name
        for jsonl_file in sorted(sessions_dir.glob("*.jsonl")):
            if DELETED_SUFFIX in jsonl_file.name:
                continue
            yield agent_name, str(jsonl_file)


def get_sync_state(sessions_dir):
    """Read .sync_state from sessions directory, return {path: mtime}."""
    state_path = Path(sessions_dir) / STATE_FILE
    if not state_path.exists():
        return {}
    try:
        with open(state_path) as f:
            return json.load(f)
    except (json.JSONDecodeError, IOError):
        return {}


def save_sync_state(sessions_dir, state):
    """Write .sync_state file."""
    state_path = Path(sessions_dir) / STATE_FILE
    with open(state_path, "w") as f:
        json.dump(state, f)


def get_new_files(root_path):
    """Find files that are new or modified since last sync."""
    state = {}
    all_sessions_dirs = set()

    agents_dir = Path(root_path) / "agents"
    if agents_dir.exists():
        for agent_folder in agents_dir.iterdir():
            if agent_folder.is_dir():
                sessions_dir = agent_folder / SESSIONS_DIR_NAME
                if sessions_dir.exists():
                    all_sessions_dirs.add(str(sessions_dir))

    # Load existing state from all session dirs
    merged_state = {}
    for sd in all_sessions_dirs:
        sd_state = get_sync_state(sd)
        merged_state.update(sd_state)

    new_files = []
    for agent_name, jsonl_path in find_sessions(root_path):
        stat = os.stat(jsonl_path)
        mtime = stat.st_mtime
        file_key = jsonl_path
        old_mtime = merged_state.get(file_key, 0)
        if mtime > old_mtime:
            new_files.append((agent_name, jsonl_path))
        merged_state[file_key] = mtime

    # Save new state
    for sd in all_sessions_dirs:
        dir_files = {k: v for k, v in merged_state.items() if k.startswith(sd)}
        save_sync_state(sd, dir_files)

    return new_files


# ─────────────────────────────────────────────────────────────────
# JSONL Parser (Session mode)
# ─────────────────────────────────────────────────────────────────

def parse_jsonl(file_path):
    """Parse a JSONL file and return structured session data."""
    sessions = []
    messages = []
    tool_calls = []

    current_model_provider = ""
    current_model_id = ""
    current_thinking_level = ""
    tool_results = {}

    events = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                event = json.loads(line)
                events.append(event)
            except json.JSONDecodeError:
                continue

    if not events:
        return sessions, messages, tool_calls

    session_event = None
    for event in events:
        event_type = event.get("type", "")
        if event_type == "session":
            session_event = event
            break

    if not session_event:
        return sessions, messages, tool_calls

    session_id = session_event.get("id", "")
    session_timestamp = session_event.get("timestamp", "")
    session_cwd = session_event.get("cwd", "")
    session_version = events[-1].get("version", 0) if events else 0

    timestamps = []
    for event in events:
        ts = event.get("timestamp", "")
        if ts:
            timestamps.append(ts)

    message_seq = 0
    total_tokens = 0
    total_cost = 0.0
    message_count = 0
    tool_call_count = 0
    error_count = 0

    for event in events:
        event_type = event.get("type", "")

        if event_type == "model_change":
            current_model_provider = event.get("provider", "")
            current_model_id = event.get("modelId", "")

        elif event_type == "thinking_level_change":
            current_thinking_level = event.get("thinkingLevel", "")

        elif event_type == "message":
            message_obj = event.get("message", {})
            role = message_obj.get("role", "")
            msg_id = event.get("id", "")
            parent_id = event.get("parentId", "")
            msg_timestamp = event.get("timestamp", "")

            content_items = message_obj.get("content", [])
            text_parts = []
            tc_list = []
            for item in content_items:
                if isinstance(item, dict):
                    if item.get("type") == "text":
                        text_parts.append(item.get("text", ""))
                    elif item.get("type") == "toolCall":
                        tc_list.append(item)

            content_text = "\n".join(text_parts)

            msg_data = {
                "session_id": session_id,
                "message_id": msg_id,
                "parent_id": parent_id or "",
                "seq": message_seq,
                "role": role or "",
                "content_text": content_text,
                "raw_content": content_items if content_items else [],
                "raw_message": message_obj.get("content", []),
                "timestamp": msg_timestamp,
            }

            if role == "assistant":
                usage = message_obj.get("usage", {})
                msg_data.update({
                    "model": current_model_id,
                    "provider": current_model_provider,
                    "stop_reason": message_obj.get("stopReason", ""),
                    "tokens_input": usage.get("inputTokens", 0),
                    "tokens_output": usage.get("outputTokens", 0),
                    "tokens_cache_read": usage.get("cacheReadInputTokens", 0),
                    "tokens_cache_write": usage.get("cacheWriteInputTokens", 0),
                    "tokens_total": usage.get("totalTokens", 0),
                })
                total_tokens += usage.get("totalTokens", 0)

                if message_obj.get("cost"):
                    cost_val = message_obj["cost"].get("total", 0.0)
                    msg_data["cost_total"] = cost_val
                    total_cost += cost_val

                message_count += 1

            elif role == "toolResult":
                msg_data.update({
                    "tool_call_id": message_obj.get("toolCallId", ""),
                    "tool_name": message_obj.get("toolName", ""),
                    "is_error": message_obj.get("isError", False),
                    "exit_code": message_obj.get("exitCode"),
                    "duration_ms": message_obj.get("durationMs"),
                })
                if message_obj.get("isError"):
                    error_count += 1
                if message_obj.get("toolCallId"):
                    tool_results[message_obj["toolCallId"]] = {
                        "result_text": content_text,
                        "is_error": message_obj.get("isError", False),
                        "exit_code": message_obj.get("exitCode"),
                        "duration_ms": message_obj.get("durationMs"),
                    }

            messages.append(msg_data)
            message_seq += 1

            tc_seq = 0
            for tc in tc_list:
                tool_call_data = {
                    "session_id": session_id,
                    "message_id": msg_id,
                    "tool_call_id": tc.get("id", f"call_{msg_id}_{tc_seq}"),
                    "tool_name": tc.get("name", "unknown"),
                    "arguments": tc.get("arguments", {}),
                    "seq": tc_seq,
                }
                tr = tool_results.get(tool_call_data["tool_call_id"], {})
                tool_call_data["result_text"] = tr.get("result_text", "")
                tool_call_data["is_error"] = tr.get("is_error", False)
                tool_call_data["exit_code"] = tr.get("exit_code")
                tool_call_data["duration_ms"] = tr.get("duration_ms")
                tool_calls.append(tool_call_data)
                tool_call_count += 1
                tc_seq += 1

    start_time = timestamps[0] if timestamps else session_timestamp
    end_time = timestamps[-1] if timestamps else session_timestamp

    session_data = {
        "session_id": session_id,
        "session_version": session_version,
        "model_provider": current_model_provider,
        "model_id": current_model_id,
        "thinking_level": current_thinking_level,
        "start_time": start_time,
        "end_time": end_time,
        "cwd": session_cwd,
        "total_tokens": total_tokens,
        "total_cost": total_cost,
        "message_count": message_count,
        "tool_call_count": tool_call_count,
        "error_count": error_count,
        "raw_file_path": str(file_path),
        "status": "active",
        "metadata": {},
    }

    sessions.append(session_data)
    return sessions, messages, tool_calls


# ─────────────────────────────────────────────────────────────────
# Cron Sync Mode
# ─────────────────────────────────────────────────────────────────

def get_cron_state(state_file_path):
    """Read cron sync state, return {run_file: mtime}."""
    p = Path(state_file_path)
    if not p.exists():
        return {}
    try:
        with open(p) as f:
            return json.load(f)
    except (json.JSONDecodeError, IOError):
        return {}


def save_cron_state(state_file_path, state):
    """Write cron sync state."""
    p = Path(state_file_path)
    with open(p, "w") as f:
        json.dump(state, f)


def sync_cron_jobs(args):
    """Sync cron jobs from openclaw cron data."""
    ssh_host = args.cron_ssh
    jobs_path = args.cron_jobs_path
    runs_path = args.cron_runs_path.rstrip("/")

    print(f"Fetching jobs.json from {ssh_host}:{jobs_path}...")
    try:
        jobs_raw = ssh_read_file(ssh_host, jobs_path)
        jobs_data = json.loads(jobs_raw)
    except Exception as e:
        print(f"ERROR reading jobs.json: {e}")
        return

    jobs = jobs_data.get("jobs", [])
    job_ids = {j["id"] for j in jobs}
    print(f"  Found {len(jobs)} jobs")

    # Find runs files, filter to only those matching known job IDs
    print(f"Scanning runs directory {ssh_host}:{runs_path}/...")
    all_run_files = ssh_list_files(ssh_host, runs_path, "*.jsonl")
    run_files = [f for f in all_run_files if Path(f).stem in job_ids]
    print(f"  Found {len(run_files)} run files matching known job IDs")

    # Load sync state
    state_file = Path.home() / ".sync_cron_state"
    prev_state = get_cron_state(str(state_file))

    new_runs = []
    new_state = {}

    # Detect remote platform (Linux vs macOS) for stat syntax
    uname_result = subprocess.run(
        ["ssh", ssh_host, "uname"],
        capture_output=True, text=True, timeout=10,
    )
    is_macos = uname_result.stdout.strip() == "Darwin"
    stat_cmd = f"stat -f %m" if is_macos else f"stat -c %Y"

    for run_file in run_files:
        remote_full = f"{runs_path}/{Path(run_file).name}"
        # Get mtime via SSH
        result = subprocess.run(
            ["ssh", ssh_host, f"{stat_cmd} {remote_full}"],
            capture_output=True, text=True, timeout=10,
        )
        if result.returncode != 0:
            continue
        try:
            mtime = int(result.stdout.strip())
        except ValueError:
            continue

        old_mtime = prev_state.get(remote_full, 0)
        if mtime > old_mtime:
            new_runs.append(remote_full)
        new_state[remote_full] = mtime

    if not new_runs:
        print("No new or modified run files found.")
        save_cron_state(str(state_file), new_state)
        return

    print(f"Parsing {len(new_runs)} new/modified run file(s)...")

    all_runs = []
    for run_file in new_runs:
        print(f"  Parsing: {run_file}")
        try:
            raw = ssh_read_file(ssh_host, run_file)
            for line in raw.strip().split("\n"):
                line = line.strip()
                if not line:
                    continue
                try:
                    run_obj = json.loads(line)
                    all_runs.append(run_obj)
                except json.JSONDecodeError:
                    continue
        except Exception as e:
            print(f"  ERROR reading {run_file}: {e}")
            continue

    if not all_runs:
        print("No run records parsed.")
        save_cron_state(str(state_file), new_state)
        return

    # Save new state
    save_cron_state(str(state_file), new_state)

    payload = {
        "source_node": os.environ.get("SOURCE_NODE", ssh_host),
        "jobs": jobs,
        "runs": all_runs,
    }

    print(f"Pushing {len(jobs)} jobs and {len(all_runs)} runs to {args.remote_url}...")
    try:
        result = push_to_api(args.remote_url, payload)
        print(f"  OK: jobs_upserted={result.get('jobs_upserted', 0)}, "
              f"runs_upserted={result.get('runs_upserted', 0)}")
    except Exception as e:
        print(f"  FAILED to push cron data: {e}")


# ─────────────────────────────────────────────────────────────────
# HTTP Client
# ─────────────────────────────────────────────────────────────────

def push_to_api(remote_url, payload):
    """POST structured JSON to Django API."""
    data = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(
        remote_url,
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=120) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.read().decode('utf-8', errors='replace')}")
        raise
    except urllib.error.URLError as e:
        print(f"URL Error: {e.reason}")
        raise
    except Exception as e:
        print(f"Error: {e}")
        raise


# ─────────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(description="Sync OpenClaw sessions or cron data to Django API")
    parser.add_argument(
        "--remote-url",
        required=True,
        help="Django API bulk_upsert endpoint URL",
    )
    parser.add_argument(
        "--root-path",
        default=".",
        help="Root path containing agents/ directory (default: current dir)",
    )
    parser.add_argument(
        "--cron",
        action="store_true",
        help="Sync cron jobs and runs instead of session files",
    )
    parser.add_argument(
        "--cron-ssh",
        default="macmini",
        help="SSH host for cron data (default: macmini)",
    )
    parser.add_argument(
        "--cron-jobs-path",
        default="/Users/weishen/openclaw/cron/jobs.json",
        help="Remote path to jobs.json",
    )
    parser.add_argument(
        "--cron-runs-path",
        default="/Users/weishen/openclaw/cron/runs/",
        help="Remote directory containing run JSONL files",
    )

    args = parser.parse_args()

    if args.cron:
        sync_cron_jobs(args)
        return

    # Original session sync mode
    new_files = get_new_files(args.root_path)
    if not new_files:
        print("No new or modified session files found.")
        return

    print(f"Found {len(new_files)} new/modified session(s).")

    total_sessions = 0
    total_messages = 0
    total_tool_calls = 0

    agent_batches = {}
    for agent_name, jsonl_path in new_files:
        agent_batches.setdefault(agent_name, []).append(jsonl_path)

    for agent_name, file_paths in agent_batches.items():
        all_sessions = []
        all_messages = []
        all_tool_calls = []

        for fp in file_paths:
            print(f"  Parsing: {fp}")
            try:
                sessions, messages, tool_calls = parse_jsonl(fp)
                all_sessions.extend(sessions)
                all_messages.extend(messages)
                all_tool_calls.extend(tool_calls)
            except Exception as e:
                print(f"  ERROR parsing {fp}: {e}")
                continue

        if not all_sessions:
            continue

        payload = {
            "agent_name": agent_name,
            "source_node": os.environ.get("SOURCE_NODE", "unknown"),
            "sessions": all_sessions,
            "messages": all_messages,
            "tool_calls": all_tool_calls,
        }

        print(f"  Pushing {len(all_sessions)} session(s), "
              f"{len(all_messages)} message(s), "
              f"{len(all_tool_calls)} tool call(s)...")

        try:
            result = push_to_api(args.remote_url, payload)
            print(f"  OK: {result}")
            total_sessions += result.get("sessions_upserted", 0)
            total_messages += result.get("messages_upserted", 0)
            total_tool_calls += result.get("tool_calls_upserted", 0)
        except Exception:
            print(f"  FAILED to push {agent_name} sessions.")

    print(f"\nSync complete: {total_sessions} sessions, "
          f"{total_messages} messages, {total_tool_calls} tool calls pushed.")


if __name__ == "__main__":
    main()