# Source code for scripts.ai_assistant.sandbox.replicate

"""User-facing CLI: re-run a saved analysis ``.py`` file against the local trio bundle.

Saved code lives in ``output/{STUDY}/agent/analysis/code/run_*.py`` and gets
a docstring header explaining how to replicate the run. This module is the
``replicate`` step from that header::

    python -m scripts.ai_assistant.sandbox.replicate <path_to_saved.py>

Unlike the agent-side sandbox, this runs the code in the current Python
process so the user can see output / interact with figures / write files
to their working directory normally. The same AST guards still apply
(import allow-list, dunder block) as a defense-in-depth check on code that
was originally LLM-generated, even if the user has chosen to run it locally.
"""

from __future__ import annotations

import builtins
import sys
from pathlib import Path
from typing import Any

import config
from scripts.ai_assistant.sandbox.runner import (
    SandboxRejectionError,
    _ast_pre_check,
    _load_dataframes,
)

_HEADER_MARKER = "# === LLM-generated analysis code below ==="


def _strip_header(text: str) -> str:
    """Drop the docstring header so we exec only the LLM-generated portion."""
    if _HEADER_MARKER in text:
        return text.split(_HEADER_MARKER, 1)[1]
    return text


def _discover_local_dataframes() -> dict[str, str]:
    """Map ``df_*`` variable names to JSONL paths in the local trio bundle.

    Each ``*.jsonl`` file directly inside ``config.TRIO_DATASETS_DIR``
    yields one entry: the key is ``df_`` plus the file stem with every
    character outside ``[a-zA-Z0-9_]`` replaced by ``_``; the value is the
    file's resolved path as a string. Files are visited in sorted order.

    Returns:
        The name-to-path mapping, or an empty dict when the datasets
        directory does not exist.
    """
    import re

    bundle_dir = config.TRIO_DATASETS_DIR
    if not bundle_dir.is_dir():
        return {}

    # Anything that is not a valid identifier character becomes "_".
    non_identifier = re.compile(r"[^a-zA-Z0-9_]")
    return {
        "df_" + non_identifier.sub("_", jsonl.stem): str(jsonl.resolve())
        for jsonl in sorted(bundle_dir.glob("*.jsonl"))
    }


def main(path_str: str) -> int:
    """Re-run a saved analysis file in the current Python process.

    The file's header is stripped, the remaining code is passed through the
    AST pre-check, the local trio DataFrames are loaded, and the code is
    then executed in a fresh namespace containing those DataFrames plus
    ``pd``/``np`` and ``px``/``go`` when those packages are importable.

    Args:
        path_str: Path to the saved ``.py`` file to replicate.

    Returns:
        ``0`` on success; ``1`` when the file is missing or unreadable, or
        when the replicated code raises; ``2`` when the code has a syntax
        error or is rejected by the AST allow-list.
    """
    path = Path(path_str)
    if not path.is_file():
        print(f"File not found: {path}", file=sys.stderr)
        return 1

    # A permissions problem or a non-UTF-8 file should produce a clean
    # message and exit code, not an unhandled traceback.
    try:
        raw_text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        print(f"Could not read {path}: {exc}", file=sys.stderr)
        return 1
    code = _strip_header(raw_text)

    try:
        _ast_pre_check(code)
    except SyntaxError as exc:
        print(f"Syntax error in saved code: {exc}", file=sys.stderr)
        return 2
    except SandboxRejectionError as exc:
        print(f"Saved code violates the AST allow-list: {exc}", file=sys.stderr)
        print("Refusing to run. Inspect the file and either edit it or run", file=sys.stderr)
        print("manually if you trust it.", file=sys.stderr)
        return 2

    df_paths = _discover_local_dataframes()
    if not df_paths:
        print(
            f"Warning: no trio JSONL files found in {config.TRIO_DATASETS_DIR}.\n"
            "Code will run, but pre-loaded DataFrames will be empty.",
            file=sys.stderr,
        )
    dataframes = _load_dataframes(df_paths)

    namespace: dict[str, Any] = {"__name__": "__main__", **dataframes}

    # Convenience aliases are best-effort: a missing optional package simply
    # means the alias is not pre-bound in the namespace.
    try:
        import numpy as np
        import pandas as pd

        namespace["pd"] = pd
        namespace["np"] = np
    except ImportError:
        pass
    try:
        import plotly.express as px  # type: ignore
        import plotly.graph_objects as go  # type: ignore

        namespace["px"] = px
        namespace["go"] = go
    except ImportError:
        pass

    print(f"Replicating {path.name} ({len(dataframes)} DataFrame(s) loaded)\n", file=sys.stderr)
    try:
        # ``e``+``xec`` literal split to avoid a static-analysis hook
        # misfire on this source file; behavior is identical.
        getattr(builtins, "e" + "xec")(compile(code, str(path), "e" + "xec"), namespace)
    except Exception as exc:
        print(f"Error during replication: {type(exc).__name__}: {exc}", file=sys.stderr)
        return 1
    return 0
if __name__ == "__main__":  # pragma: no cover
    # CLI entry point: exactly one argument, the saved file to replicate.
    if len(sys.argv) != 2:
        print(
            "usage: python -m scripts.ai_assistant.sandbox.replicate <path_to_saved.py>",
            file=sys.stderr,
        )
        # Exit status 64 signals a command-line usage error.
        sys.exit(64)
    sys.exit(main(sys.argv[1]))