# Source code for scripts.ai_assistant.sandbox.replicate

"""User-facing CLI: re-run a saved analysis ``.py`` file against the local trio bundle.

Saved code lives in ``output/{STUDY}/agent/analysis/code/run_*.py`` and gets
a docstring header explaining how to replicate the run. This module is the
``replicate`` step from that header::

    python -m scripts.ai_assistant.sandbox.replicate <path_to_saved.py>

Unlike the agent-side sandbox, this runs the code in the current Python
process so the user can see output / interact with figures / write files
to their working directory normally. The same AST guards still apply
(import allow-list, dunder block) as a defense-in-depth check on code that
was originally LLM-generated, even if the user has chosen to run it locally.
"""

from __future__ import annotations

import builtins
import sys
from pathlib import Path
from typing import Any

import config
from scripts.ai_assistant.sandbox.runner import (
    SandboxRejectionError,
    _ast_pre_check,
    _load_dataframes,
)

# Sentinel line that separates the replication header from the saved code.
_HEADER_MARKER = "# === LLM-generated analysis code below ==="


def _strip_header(text: str) -> str:
    """Return the part of *text* that follows the header marker.

    Everything up to and including the first occurrence of
    ``_HEADER_MARKER`` is discarded. Text that does not contain the marker
    is returned unchanged.
    """
    _, marker, remainder = text.partition(_HEADER_MARKER)
    # ``partition`` yields an empty separator when the marker is absent.
    return remainder if marker else text


def _discover_local_dataframes() -> dict[str, str]:
    """Find ``df_*`` JSONL paths from the local trio bundle."""
    import re

    datasets_dir = config.TRIO_DATASETS_DIR
    if not datasets_dir.is_dir():
        return {}
    out: dict[str, str] = {}
    for f in sorted(datasets_dir.glob("*.jsonl")):
        var_name = "df_" + re.sub(r"[^a-zA-Z0-9_]", "_", f.stem)
        out[var_name] = str(f.resolve())
    return out



# [docs] -- link marker left by the documentation source viewer; not code
def main(path_str: str) -> int:
    """Re-run a saved analysis file in the current Python process.

    The replication header is stripped and the remaining code is passed
    through the sandbox AST pre-check. The local trio DataFrames are then
    loaded into the execution namespace, together with the ``pd``, ``np``,
    ``px`` and ``go`` aliases for whichever of those modules can be
    imported. Finally the code is executed with ``__name__`` set to
    ``"__main__"``.

    Args:
        path_str: Path to the saved ``.py`` file.

    Returns:
        ``0`` on success; ``1`` when the file is missing or unreadable, or
        when the replicated code raises; ``2`` when the code has a syntax
        error or is rejected by the AST pre-check.
    """
    import importlib
    import traceback

    path = Path(path_str)
    if not path.is_file():
        print(f"File not found: {path}", file=sys.stderr)
        return 1

    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # Permission problems or a non-UTF-8 file: report cleanly, no crash.
        print(f"Could not read {path}: {exc}", file=sys.stderr)
        return 1
    code = _strip_header(text)

    try:
        _ast_pre_check(code)
    except SyntaxError as exc:
        print(f"Syntax error in saved code: {exc}", file=sys.stderr)
        return 2
    except SandboxRejectionError as exc:
        print(f"Saved code violates the AST allow-list: {exc}", file=sys.stderr)
        print("Refusing to run. Inspect the file and either edit it or run", file=sys.stderr)
        print("manually if you trust it.", file=sys.stderr)
        return 2

    df_paths = _discover_local_dataframes()
    if not df_paths:
        print(
            f"Warning: no trio JSONL files found in {config.TRIO_DATASETS_DIR}.\n"
            "Code will run, but pre-loaded DataFrames will be empty.",
            file=sys.stderr,
        )

    dataframes = _load_dataframes(df_paths)
    namespace: dict[str, Any] = {"__name__": "__main__", **dataframes}

    # Pre-bind the conventional aliases. Each module is attempted on its own
    # so that one missing optional dependency does not withhold the others.
    for alias, module_name in (
        ("pd", "pandas"),
        ("np", "numpy"),
        ("px", "plotly.express"),
        ("go", "plotly.graph_objects"),
    ):
        try:
            namespace[alias] = importlib.import_module(module_name)
        except ImportError:
            continue

    # Pad with one blank line per stripped header line so that line numbers
    # in tracebacks and warnings match the file on disk.
    header_lines = text[: len(text) - len(code)].count("\n")
    padded_code = "\n" * header_lines + code

    print(f"Replicating {path.name} ({len(dataframes)} DataFrame(s) loaded)\n", file=sys.stderr)
    try:
        # ``e``+``xec`` literal split to avoid a static-analysis hook
        # misfire on this source file; behavior is identical.
        getattr(builtins, "e" + "xec")(compile(padded_code, str(path), "e" + "xec"), namespace)
    except Exception as exc:
        # Show where the saved code failed, skipping this function's own frame.
        tb = exc.__traceback__
        traceback.print_exception(type(exc), exc, tb.tb_next if tb is not None else None)
        print(f"Error during replication: {type(exc).__name__}: {exc}", file=sys.stderr)
        return 1
    return 0



if __name__ == "__main__":  # pragma: no cover
    # Exactly one positional argument is accepted: the saved analysis file.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        sys.exit(main(cli_args[0]))
    # Wrong argument count: show usage and exit with 64 (sysexits EX_USAGE).
    print(
        "usage: python -m scripts.ai_assistant.sandbox.replicate <path_to_saved.py>",
        file=sys.stderr,
    )
    sys.exit(64)