Source code for scripts.ai_assistant.file_access

"""Agent-world file-access boundary enforcement.

The production LLM agent's permitted zones are (2026-04-24 boundary design):

* **Read** — ``TRIO_BUNDLE_DIR`` (scrubbed, k-anon-gated trio outputs) **or**
  ``AGENT_STATE_DIR`` (its own analysis outputs and conversations).
  A small allowlist admits read-only source-tree config files
  (``config/study_knowledge.yaml``) that tool implementations need.
* **Write** — ``AGENT_STATE_DIR`` only.

Everything else — ``STUDY_AUDIT_DIR`` (incl. telemetry), ``RAW_DATA_DIR``,
``LOGS_DIR``, ``STUDY_STAGING_DIR``, arbitrary filesystem paths — is
hard-rejected with :class:`ZoneViolationError` (a ``PermissionError``
subclass from ``scripts.security.secure_env``).

This module is the chokepoint: every agent-tool file read or write should
call :func:`validate_agent_read` or :func:`validate_agent_write` before
touching disk. The existing ``assert_trio_bundle_zone`` and
``assert_output_zone`` in ``scripts.security.secure_env`` remain valid
narrower checks — this module layers the expanded agent-runtime zone on
top without changing pipeline-side enforcement.
"""

from __future__ import annotations

import os
from pathlib import Path

import config
from scripts.security.secure_env import ZoneViolationError

# Public names exported by this module. ``ZoneViolationError`` is defined in
# ``scripts.security.secure_env`` and re-exported here alongside the
# validators that raise it.
__all__ = [
    "ZoneViolationError",
    "is_agent_readable",
    "validate_agent_read",
    "validate_agent_write",
    "validate_sandbox_write",
]


def _resolve(p: str | Path) -> str:
    return os.path.realpath(str(p))


def _is_within(path_realpath: str, base_realpath: str) -> bool:
    """Tell whether *path_realpath* equals or lies beneath *base_realpath*.

    Both arguments are expected to be ``os.path.realpath``-resolved already.
    The comparison is component-wise (``os.path.commonpath``), so a sibling
    that merely shares a string prefix with the base does not match.
    """
    try:
        shared_prefix = os.path.commonpath((path_realpath, base_realpath))
    except ValueError:
        # commonpath refuses inputs it cannot compare — e.g. paths on
        # different Windows drives, or a mix of absolute and relative
        # paths. Such a pair can never be in the same zone.
        return False
    return shared_prefix == base_realpath


def _zones() -> tuple[list[str], list[str], frozenset[str]]:
    """Build the agent's permitted zones from the live ``config`` values.

    Evaluated on every validation call instead of being cached, so that
    ``conftest.py`` monkeypatches of ``config.TRIO_BUNDLE_DIR`` /
    ``config.AGENT_STATE_DIR`` take effect. The work is only a few path
    resolutions.

    Returns:
        A ``(read_roots, write_roots, read_allowlist)`` triple of resolved
        paths: directory roots the agent may read under, directory roots it
        may write under, and individual files it may read by exact match.
    """
    readable = [
        _resolve(zone_dir)
        for zone_dir in (config.TRIO_BUNDLE_DIR, config.AGENT_STATE_DIR)
    ]
    writable = [_resolve(config.AGENT_STATE_DIR)]

    # Repo-tracked config that StudyKnowledge and similar helpers load at
    # tool-invocation time. Per the hard PHI rule this is the "how" surface,
    # not the "what" — it lives in the source tree, two directories above
    # this module's own directory.
    repo_root = Path(__file__).resolve().parents[2]
    study_knowledge = repo_root.joinpath("config", "study_knowledge.yaml")
    exact_files = frozenset((_resolve(study_knowledge),))

    return readable, writable, exact_files



[docs]
def validate_agent_read(path: str | Path) -> Path:
    """Check *path* against the agent's read zones and return it resolved.

    A path is accepted when its resolved form is one of the allowlisted
    files or sits inside any of the read roots reported by :func:`_zones`.

    Returns:
        The resolved path as a :class:`~pathlib.Path`.

    Raises:
        ZoneViolationError: *path* is outside the agent's permitted read zones.
    """
    readable_roots, _unused_write_roots, exact_files = _zones()
    candidate = _resolve(path)

    # Exact-file allowlist first, then directory containment.
    permitted = candidate in exact_files or any(
        _is_within(candidate, zone) for zone in readable_roots
    )
    if not permitted:
        raise ZoneViolationError(
            f"Agent read rejected — path is outside the permitted zones "
            f"(trio_bundle/ or agent/): {path}"
        )
    return Path(candidate)




[docs]
def validate_agent_write(path: str | Path) -> Path:
    """Check *path* against the agent's write zones and return it resolved.

    A path is accepted only when its resolved form sits inside one of the
    write roots reported by :func:`_zones`.

    Returns:
        The resolved path as a :class:`~pathlib.Path`.

    Raises:
        ZoneViolationError: *path* is outside ``AGENT_STATE_DIR``.
    """
    writable_roots = _zones()[1]
    candidate = _resolve(path)

    if any(_is_within(candidate, zone) for zone in writable_roots):
        return Path(candidate)

    raise ZoneViolationError(
        f"Agent write rejected — path is outside the agent zone "
        f"(only output/{{STUDY}}/agent/** is writable): {path}"
    )




[docs]
def is_agent_readable(path: str | Path) -> bool:
    """Non-raising variant of :func:`validate_agent_read` for sentinel checks.

    Returns:
        ``True`` when :func:`validate_agent_read` accepts *path*; ``False``
        when it rejects the path or the path cannot be resolved at all.
    """
    try:
        validate_agent_read(path)
    except (ZoneViolationError, ValueError):
        # ZoneViolationError: the path resolved but lies outside the read
        # zones. ValueError: path resolution itself failed — for example
        # ``os.path.realpath`` rejects a path containing an embedded NUL
        # byte. Either way the path is not readable, so fail closed instead
        # of letting the exception escape a helper documented as non-raising.
        return False
    return True




[docs]
def validate_sandbox_write(path: str | Path) -> Path:
    """Check a write target for the exec_python sandbox and return it resolved.

    The sandbox executes LLM-generated code, so its write scope is tighter
    than that of tool code: only ``AGENT_OUTPUT_DIR`` (``agent/analysis/``)
    is accepted, not the whole ``AGENT_STATE_DIR``.

    Containment is decided by :func:`_is_within`, which compares whole path
    components via ``os.path.commonpath``; a sibling such as
    ``agent/analysis_exfil`` therefore cannot pass as ``analysis/``.

    Returns:
        The resolved path as a :class:`~pathlib.Path`.

    Raises:
        ZoneViolationError: *path* is outside ``AGENT_OUTPUT_DIR``.
    """
    allowed_root = _resolve(config.AGENT_OUTPUT_DIR)
    candidate = _resolve(path)

    if not _is_within(candidate, allowed_root):
        raise ZoneViolationError(
            f"Sandbox write denied — exec_python may only write inside agent/analysis/: {path}"
        )
    return Path(candidate)