"""Central runtime configuration for RePORT AI Portal.
**What.** All path constants, environment-variable resolution, study
detection, LLM provider inference, staging-directory management,
and directory creation in one place.
**Why.** 138 call sites across the pipeline, agent, UI, and test suite
use ``import config`` — a single canonical location prevents scattered
``os.getenv`` and ``Path(...)`` literals throughout the codebase.
**How.** All values are resolved at import time. ``STUDY_NAME`` is
determined by the ``$STUDY_NAME`` env var or a filesystem scan of
``data/raw/``. LLM provider is inferred from model-name prefix unless
overridden by ``$LLM_PROVIDER``. Staging directories are NOT created
eagerly; call :func:`ensure_directories` after startup.
"""
# config.py
from __future__ import annotations
import logging
import os
from pathlib import Path
from typing import Any, overload
import yaml
# ----------------------------------------------------------------------------
# ENV HELPERS (centralized, validated access)
# ----------------------------------------------------------------------------
@overload
def _get_env(key: str, default: str) -> str: ...
@overload
def _get_env(key: str, default: None = None) -> str | None: ...
def _get_env(key: str, default: str | None = None) -> str | None:
    """Read ``key`` from the environment, treating an empty string as unset."""
    raw = os.environ.get(key)
    if raw:  # non-empty string only; "" and None both fall through to default
        return raw
    return default
def _get_env_int(key: str, default: int) -> int:
    """Return ``key`` parsed as an integer, or ``default`` when unset/empty.

    Raises:
        ValueError: When the variable is set but is not a valid integer.
    """
    raw = _get_env(key)
    if raw is None:
        return default
    try:
        parsed = int(raw)
    except ValueError as exc:
        raise ValueError(f"{key} must be an integer") from exc
    return parsed
def _get_env_bool(key: str, default: bool) -> bool:
    """Return ``key`` as a boolean; "1"/"true"/"yes"/"on" (case-insensitive) are True."""
    truthy = {"1", "true", "yes", "on"}
    raw = str(_get_env(key, str(default)))
    return raw.lower() in truthy
def production_mode_enabled() -> bool:
    """Return True when production controls should fail closed.

    Any one of these signals flips the portal into production posture:
    ``$REPORT_AI_PRODUCTION`` or ``$REPORT_AI_REQUIRE_PHI_LOG_REDACTOR``
    set to a truthy value, or ``$REPORT_AI_AUTH_MODE`` set to ``proxy``
    (case-insensitive, surrounding whitespace ignored).
    """
    # NOTE: removed stray "[docs]" Sphinx-export artifact that preceded this
    # def — it is not valid Python.
    return (
        _get_env_bool("REPORT_AI_PRODUCTION", False)
        or _get_env_bool("REPORT_AI_REQUIRE_PHI_LOG_REDACTOR", False)
        or str(_get_env("REPORT_AI_AUTH_MODE", "")).strip().lower() == "proxy"
    )
def strict_study_detection_enabled() -> bool:
    """Return True when missing auto-detected study data should abort import.

    Controlled by the ``$REPORT_AI_STRICT_STUDY_DETECTION`` env var
    (default: off). Removed stray "[docs]" Sphinx-export artifact that
    preceded this def — it is not valid Python.
    """
    return _get_env_bool("REPORT_AI_STRICT_STUDY_DETECTION", False)
# ----------------------------------------------------------------------------
# YAML CONFIG (config/config.yaml — optional overlay)
# ----------------------------------------------------------------------------
CONFIG_YAML_PATH = Path(__file__).resolve().parent / "config" / "config.yaml"


def _load_yaml_config() -> dict[str, Any]:
    """Load config.yaml if it exists; return empty dict otherwise."""
    if not CONFIG_YAML_PATH.is_file():
        return {}
    with CONFIG_YAML_PATH.open() as fh:
        loaded = yaml.safe_load(fh)
    # safe_load yields None for an empty file and may yield a non-dict
    # scalar/list for malformed overlays — normalise those to {}.
    return loaded if isinstance(loaded, dict) else {}


_YAML_CFG: dict[str, Any] = _load_yaml_config()
def yaml_get(*keys: str, default: Any = None) -> Any:
    """Retrieve a nested key from the loaded YAML config.

    Walks ``_YAML_CFG`` one key at a time; returns ``default`` as soon as a
    level is not a dict, or when the final value is None. Removed stray
    "[docs]" Sphinx-export artifact that preceded this def.

    >>> yaml_get("app", "log_level", default="INFO")
    'INFO'
    """
    node: Any = _YAML_CFG
    for k in keys:
        if isinstance(node, dict):
            node = node.get(k)
        else:
            return default
    return node if node is not None else default
# ----------------------------------------------------------------------------
# VERSION
# ----------------------------------------------------------------------------
try:
    from __version__ import __version__
except ImportError:
    # Fallback when the package version module is absent (e.g. source checkout).
    __version__ = "0.0.0"
# Default study folder name used when auto-detection finds nothing.
DEFAULT_DATASET_NAME = "Indo-VAP"
DEFAULT_LOG_LEVEL = "INFO"
LOG_NAME = "report_ai_portal"
# Resolution order: $LOG_LEVEL env var → config.yaml app.log_level → "INFO".
LOG_LEVEL = _get_env("LOG_LEVEL", yaml_get("app", "log_level", default=DEFAULT_LOG_LEVEL))
logger = logging.getLogger(LOG_NAME)
# ----------------------------------------------------------------------------
# BASE PATHS
# ----------------------------------------------------------------------------
# ``__file__`` may be absent in embedded/frozen contexts; fall back to CWD.
BASE_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
DATA_DIR = BASE_DIR / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
OUTPUT_DIR = BASE_DIR / "output"
LOGS_DIR = BASE_DIR / ".logs"
TMP_DIR = BASE_DIR / "tmp"
# ----------------------------------------------------------------------------
# STUDY DETECTION
# ----------------------------------------------------------------------------
def detect_study_name(*, strict: bool | None = None) -> str:
    """Auto-detect the study folder name under ``data/raw/``.

    A valid study is a non-hidden directory (also excluding ``.backup``,
    ``.DS_Store`` and ``output``) that contains a ``datasets/`` subdirectory.
    Candidates are sorted so the alphabetically-first match wins — this keeps
    detection deterministic across filesystems with differing iteration order.
    (Removed a stray "[docs]" Sphinx-export artifact that preceded this def.)

    Args:
        strict: When True, raise instead of falling back to
            ``DEFAULT_DATASET_NAME``. ``None`` defers to
            :func:`strict_study_detection_enabled`.

    Returns:
        The detected study folder name, or ``DEFAULT_DATASET_NAME`` on
        fallback (non-strict mode only).

    Raises:
        RuntimeError: In strict mode, when ``RAW_DATA_DIR`` is missing, no
            valid study exists, or the filesystem scan fails.
    """
    strict = strict_study_detection_enabled() if strict is None else strict
    if not RAW_DATA_DIR.exists():
        msg = f"RAW_DATA_DIR missing: {RAW_DATA_DIR}"
        if strict:
            raise RuntimeError(msg)
        logger.warning("%s → using default: %s", msg, DEFAULT_DATASET_NAME)
        return DEFAULT_DATASET_NAME
    try:
        # ".backup"/".DS_Store" are redundant with the dot-prefix filter but
        # kept as explicit documentation of known junk entries.
        exclude = {".backup", ".DS_Store", "output"}
        candidates = [
            p.name
            for p in RAW_DATA_DIR.iterdir()
            if p.is_dir() and not p.name.startswith(".") and p.name not in exclude
        ]
        for candidate in sorted(candidates):
            if (RAW_DATA_DIR / candidate / "datasets").is_dir():
                return candidate
        msg = f"No valid study found under {RAW_DATA_DIR}"
        if strict:
            raise RuntimeError(msg)
        logger.warning("%s → using default: %s", msg, DEFAULT_DATASET_NAME)
        return DEFAULT_DATASET_NAME
    except OSError as exc:
        if strict:
            raise RuntimeError(f"Study detection failed under {RAW_DATA_DIR}") from exc
        logger.warning("Study detection failed → fallback to default", exc_info=True)
        return DEFAULT_DATASET_NAME
# ENV override ALWAYS wins
_STUDY_NAME_ENV = _get_env("STUDY_NAME")
if _STUDY_NAME_ENV:
    # Reject path separators and "."/".." so STUDY_NAME can never escape
    # data/raw/ when joined into the study paths below.
    if "/" in _STUDY_NAME_ENV or "\\" in _STUDY_NAME_ENV or _STUDY_NAME_ENV in {".", ".."}:
        raise ValueError("STUDY_NAME must be a plain folder name, not a path")
    STUDY_NAME = _STUDY_NAME_ENV
else:
    STUDY_NAME = detect_study_name()
# ----------------------------------------------------------------------------
# STUDY PATHS
# ----------------------------------------------------------------------------
# Input tree (operator-provided) and output tree (pipeline-generated) for the
# detected/selected study.
STUDY_DATA_DIR = RAW_DATA_DIR / STUDY_NAME
STUDY_OUTPUT_DIR = OUTPUT_DIR / STUDY_NAME
# Raw study subdirectories (under data/raw/<study>/)
DATASETS_DIR = STUDY_DATA_DIR / "datasets"
ANNOTATED_PDFS_DIR = STUDY_DATA_DIR / "annotated_pdfs"
DATA_DICTIONARY_DIR = STUDY_DATA_DIR / "data_dictionary"
# Trio bundle is the single consolidated clean-output tree.
# Everything that was formerly split across clean/jsonl/* now lives here.
TRIO_BUNDLE_DIR = STUDY_OUTPUT_DIR / "trio_bundle"
TRIO_DATASETS_DIR = TRIO_BUNDLE_DIR / "datasets"
STUDY_AUDIT_DIR = STUDY_OUTPUT_DIR / "audit"
# Audit-report paths (written by the cleanup/dedup pipeline).
# Only the dataset leg produces audit reports — dictionary and PDF legs carry
# no PHI, so their cleanup is side-effect-only (pruning without a report).
# Step-cache manifests for dataset_processing also land under STUDY_AUDIT_DIR
# so the LLM-visible trio_bundle/ stays content-only.
AUDIT_DATASET_REPORT_PATH: Path = STUDY_AUDIT_DIR / "dataset_cleanup_report.json"
AUDIT_SCRUB_REPORT_PATH: Path = STUDY_AUDIT_DIR / "phi_scrub_report.json"
# Dictionary and PDF extraction artifacts live inside the trio bundle so
# the LLM agent can read them alongside the canonical datasets.
DICTIONARY_JSON_OUTPUT_DIR = TRIO_BUNDLE_DIR / "dictionary"
PDF_EXTRACTIONS_DIR = TRIO_BUNDLE_DIR / "pdfs"
# ----------------------------------------------------------------------------
# AGENT STATE TIER (per-session state, NOT study output)
# ----------------------------------------------------------------------------
# Per-session, agent-owned operational state — analysis runs, conversation
# transcripts. Telemetry lives under STUDY_AUDIT_DIR so the LLM's permitted
# agent/** zone stays free of operator-audit bytes. Everything inside the
# fully-gitignored ``output/`` tree keeps PHI-scrubbed cohort bytes out of
# git by default.
AGENT_STATE_DIR: Path = STUDY_OUTPUT_DIR / "agent"
AGENT_OUTPUT_DIR: Path = AGENT_STATE_DIR / "analysis"
CONVERSATIONS_DIR: Path = AGENT_STATE_DIR / "conversations"
# ----------------------------------------------------------------------------
# SNAPSHOT TIER (human-reviewed baseline; LLM-INVISIBLE)
# ----------------------------------------------------------------------------
# Snapshots are separate from raw data, agent state, and the live trio bundle.
# They hold a cleaned-and-verified trio bundle (datasets / dictionary / pdfs /
# variables.json) saved by a human after review. When PDF extraction fails or
# the operator clicks "Use Existing Study", this baseline is copied over the
# live ``output/{STUDY}/trio_bundle/``.
#
# Read posture:
#   - The pipeline reads this baseline when fresh PDF extraction cannot
#     produce a complete bundle.
#   - The wizard's "Use Existing Study" path restores this baseline over the
#     live trio bundle before chat starts.
#   - The LLM agent's read zone is restricted to ``trio_bundle/`` + ``agent/``;
#     ``data/snapshots/`` is intentionally outside that zone so the LLM cannot
#     accidentally read a stale baseline as if it were live data.
#
# Path: under ``data/`` because it is reviewed study data, not runtime output.
STUDY_SNAPSHOTS_DIR: Path = DATA_DIR / "snapshots" / STUDY_NAME
# Staging workspace — per-study tree inside TMP_DIR. Managed per-run by
# main.py's _prepare_staging() / _publish_staging(); NOT created eagerly by
# ensure_directories() so a stale workspace from a crashed previous run is
# always purged explicitly before reuse.
STUDY_STAGING_DIR: Path = TMP_DIR / STUDY_NAME
STAGING_DATASETS_DIR: Path = STUDY_STAGING_DIR / "datasets"
STAGING_DICTIONARY_DIR: Path = STUDY_STAGING_DIR / "dictionary"
STAGING_PDFS_DIR: Path = STUDY_STAGING_DIR / "pdfs"
# Unified variables reference (built from dictionary + PDF extractions);
# lives inside the trio bundle so it is visible to the LLM agent.
VARIABLES_JSON_PATH: Path = TRIO_BUNDLE_DIR / "variables.json"
# ----------------------------------------------------------------------------
# PHI SCRUB
# ----------------------------------------------------------------------------
# Narrow PHI handling: per-subject deterministic date jitter (SANT method) +
# HMAC-SHA256 ID pseudonymization. See scripts/security/phi_scrub.py.
#
# Config file lives alongside the module so study-specific regex patterns can
# be edited without touching code. The path is repo-relative (under BASE_DIR),
# unlike the HMAC key, which lives outside the repo (see PHI_KEY_PATH below).
PHI_SCRUB_CONFIG_PATH: Path = BASE_DIR / "scripts" / "security" / "phi_scrub.yaml"
def _phi_key_path() -> Path:
    """Resolve the sidecar PHI HMAC key path.

    Uses ``$XDG_CONFIG_HOME/report_ai_portal/phi_key`` when the env var is
    set (and non-empty), otherwise falls back to
    ``~/.config/report_ai_portal/phi_key``. The key lives OUTSIDE the repo
    tree and is never read by the agent or committed to git.
    """
    config_root = os.environ.get("XDG_CONFIG_HOME")
    if config_root:
        base = Path(config_root)
    else:
        base = Path.home() / ".config"
    return base / "report_ai_portal" / "phi_key"


PHI_KEY_PATH: Path = _phi_key_path()
# ----------------------------------------------------------------------------
# EXTRACTION CONFIG (centralized — used by all extraction modules)
# ----------------------------------------------------------------------------
# Temporary-file prefixes for atomic writes. Each module uses its own prefix
# so crash-leftover temp files can be attributed to their source.
TEMP_PREFIX_DATASET: str = "report_ai_portal_dataset_"
TEMP_PREFIX_PDF: str = "report_ai_portal_pdf_extract_"
TEMP_PREFIX_DICT: str = "report_ai_portal_dict_"
TEMP_PREFIX_TRIO_BUNDLE: str = "report_ai_portal_trio_bundle_"
TEMP_PREFIX_DEDUP: str = "report_ai_portal_dedup_"
# Secure temp workspace — the prefix is intentionally generic+randomised so
# the directory name leaks no information about what pipeline stage created it.
SECURE_TEMP_PREFIX: str = "rpln_"
# PDF extraction — rate-limit settings ($PDF_INTER_DELAY seconds, default 10;
# $PDF_MAX_TOKENS, default 64000). NOTE(review): the float parse here raises a
# bare ValueError on a malformed env var, unlike _get_env_int's wrapped error.
PDF_EXTRACTION_INTER_DELAY: float = float(_get_env("PDF_INTER_DELAY", "10.0"))
PDF_EXTRACTION_MAX_TOKENS: int = _get_env_int("PDF_MAX_TOKENS", 64000)
# Duplicate-column detection regex for dataset extraction: captures a base
# name plus an optional underscore and a trailing numeric suffix.
DUPLICATE_COLUMN_PATTERN: str = r"^(.+?)_?(\d+)$"
# ----------------------------------------------------------------------------
# LLM PROVIDER INFERENCE
# ----------------------------------------------------------------------------
def _infer_provider(model_name: str) -> str:
    """Infer LangChain provider string from model name prefix.

    Recognised patterns (checked in order):
        moonshotai/*, nvidia/*, mistralai/*, deepseek-ai/*, qwen/*, meta/*
            ("org/model" slug format)              → "nvidia-ai-endpoints"
        llama*, mistral*, phi3*/phi-3*, gemma*, qwen* (incl. qwen3:8b),
        deepseek*, codellama*, tinyllama*, vicuna*, falcon*, orca*
                                                    → "ollama"
        claude*                                     → "anthropic"
        gpt-*, o1*, o3*, o4*, text-davinci*         → "openai"
        gemini*                                     → "google-genai"

    Falls back to ``"ollama"`` (local inference, no API key needed).
    """
    m = model_name.lower()
    # NVIDIA-hosted models use "org/model" slug format. This check MUST run
    # before the ollama prefix check: "mistralai/...", "deepseek-ai/..." and
    # "qwen/..." start with the bare "mistral"/"deepseek"/"qwen" ollama
    # prefixes and would otherwise never reach this branch (previously this
    # check came last and was unreachable for those orgs).
    _nvidia_orgs = ("moonshotai/", "nvidia/", "mistralai/", "deepseek-ai/", "qwen/", "meta/")
    if m.startswith(_nvidia_orgs):
        return "nvidia-ai-endpoints"
    _ollama_prefixes = (
        "llama",
        "mistral",
        "phi3",
        "phi-3",
        "gemma",
        "qwen",
        "deepseek",
        "codellama",
        "tinyllama",
        "vicuna",
        "falcon",
        "orca",
    )
    if m.startswith(_ollama_prefixes):
        return "ollama"
    if m.startswith("claude"):
        return "anthropic"
    if m.startswith(("gpt-", "o1", "o3", "o4", "text-davinci")):
        return "openai"
    if m.startswith("gemini"):
        return "google-genai"
    return "ollama"  # safe default — local inference, no key needed
# Model resolution: $LLM_MODEL env var → config.yaml ai_assistant.llm_model →
# "qwen3:8b" default.
LLM_MODEL = _get_env("LLM_MODEL", yaml_get("ai_assistant", "llm_model", default="qwen3:8b"))
# LLM_PROVIDER: explicit env var wins; otherwise infer from model name.
LLM_PROVIDER: str = _get_env("LLM_PROVIDER") or _infer_provider(LLM_MODEL)
# Qwen3 downgrade ladder — descending parameter count. When Ollama refuses
# a rung with "requires more system memory", _init_llm walks this list to
# find the largest rung that actually loads. Only applies to qwen3:* models;
# other models (Claude, GPT, custom Ollama) pass through unchanged.
QWEN3_DOWNGRADE_LADDER: tuple[str, ...] = ("qwen3:8b", "qwen3:4b", "qwen3:1.7b")


def preferred_or_installed_downgrade(model: str) -> list[str]:
    """Return the sequence of model names to try starting at ``model``.

    For qwen3 rungs in :data:`QWEN3_DOWNGRADE_LADDER`, returns the ladder
    from the given rung downward. For any other model, returns a one-element
    list — we only auto-step qwen3 because the three rungs are behaviourally
    compatible (same family, same tool-use format, same thinking convention).
    (Removed a stray "[docs]" Sphinx-export artifact that preceded this def.)
    """
    if model in QWEN3_DOWNGRADE_LADDER:
        start = QWEN3_DOWNGRADE_LADDER.index(model)
        return list(QWEN3_DOWNGRADE_LADDER[start:])
    return [model]
# ----------------------------------------------------------------------------
# AI Assistant / AGENT
# ----------------------------------------------------------------------------
# Telemetry lives under STUDY_AUDIT_DIR (not AGENT_STATE_DIR) to keep the
# LLM's permitted agent/** zone clear of operator-audit bytes. Per the PHI
# rule, LLM must never read telemetry; parking it under audit/ — the same
# zone that holds phi_scrub_report.json and dataset_cleanup_report.json —
# makes that boundary structural, not a per-file carve-out.
TELEMETRY_DIR = STUDY_AUDIT_DIR / "telemetry"
TELEMETRY_SINK = TELEMETRY_DIR / "events.jsonl"
# Chat / agent
AGENT_MAX_TOKENS: int = _get_env_int("AGENT_MAX_TOKENS", 16384)
AGENT_TIMEOUT: int = _get_env_int("AGENT_TIMEOUT", 300)
CHAT_RATE_LIMIT_WINDOW_SECONDS: int = _get_env_int("CHAT_RATE_LIMIT_WINDOW_SECONDS", 60)
CHAT_RATE_LIMIT_MAX_TURNS: int = _get_env_int("CHAT_RATE_LIMIT_MAX_TURNS", 12)
# Watchdog on the agent stream: raise TimeoutError if no chunk is produced
# for this many seconds. Measures inter-chunk idle time, NOT total wall
# clock — so slow-but-steady streams (long tool runs) stay alive. The E3
# benchmark stall went 6+ minutes of total silence with no stop signal;
# 180s is ~10x the p99 of a healthy routing step.
AGENT_STREAM_IDLE_TIMEOUT: int = _get_env_int("AGENT_STREAM_IDLE_TIMEOUT", 180)
# Analytical engine limits
ANALYSIS_TIMEOUT: int = _get_env_int("ANALYSIS_TIMEOUT", 300)
ANALYSIS_MAX_OUTPUT: int = _get_env_int("ANALYSIS_MAX_OUTPUT", 200_000)
ANALYSIS_MAX_FIGURES: int = _get_env_int("ANALYSIS_MAX_FIGURES", 20)
# Sandbox subprocess limits — operational tunables (safe to lower; lowering
# only tightens the security envelope). The trust boundary itself
# (import allowlist, env-var blocklist, AST guards) is hardcoded in
# ``scripts.ai_assistant.sandbox`` and is not configurable from here.
#
# Defaults sized for production runs of the typical pandas+numpy+plotly
# stack: numpy alone reserves ~700 MB of address space on Linux when loaded
# (RLIMIT_AS is whole-vmap, not RSS). RLIMIT_NPROC is per-user not per-tree
# on Linux, so a small cap conflicts with whatever else the host user is
# running — 4096 is high enough to coexist with shared CI environments
# while still preventing runaway fork bombs.
SANDBOX_MAX_MEMORY_MB: int = _get_env_int("SANDBOX_MAX_MEMORY_MB", 2048)
SANDBOX_MAX_PROCS: int = _get_env_int("SANDBOX_MAX_PROCS", 4096)
SANDBOX_MAX_FILES: int = _get_env_int("SANDBOX_MAX_FILES", 256)
# Consistency: use the shared _get_env_bool helper instead of an inline
# re-implementation of the same truthy-set parse. Behavior is identical:
# unset/empty → True; "1"/"true"/"yes"/"on" (case-insensitive) → True;
# anything else → False.
SANDBOX_PERSIST_CODE: bool = _get_env_bool("SANDBOX_PERSIST_CODE", True)
# Orchestration mode: "auto" | "single-agent" | "multi-agent"
AGENT_ORCHESTRATION_MODE: str = _get_env(
    "AGENT_ORCHESTRATION_MODE",
    yaml_get("ai_assistant", "agent", "orchestration_mode", default="auto"),
)
# Enforce LangChain tracing OFF by default (privacy-first)
os.environ.setdefault("LANGCHAIN_TRACING_V2", "false")
# ----------------------------------------------------------------------------
# DIRECTORY CREATION
# ----------------------------------------------------------------------------
def ensure_directories() -> None:
    """Create runtime directories and harden sensitive ones to mode 0o700.

    Sensitive dirs (containing PHI-scrubbed data, agent state, conversations,
    audit, or logs) are chmod'd to 0o700 after creation so they're not
    world-readable under the typical umask 0o022. Dirs that may legitimately
    need group access (``OUTPUT_DIR`` parent; ``TMP_DIR`` is already 0o700
    via secure-staging) are left at default mode. chmod failures are
    non-fatal (best-effort). (Removed a stray "[docs]" Sphinx-export artifact
    that preceded this def; hoisted the mid-function ``import contextlib`` to
    the top of the function.)
    """
    import contextlib  # function-local on purpose: only used here

    sensitive_paths = [
        STUDY_OUTPUT_DIR,
        LOGS_DIR,
        TRIO_BUNDLE_DIR,
        TRIO_DATASETS_DIR,
        DICTIONARY_JSON_OUTPUT_DIR,
        PDF_EXTRACTIONS_DIR,
        STUDY_AUDIT_DIR,
        AGENT_STATE_DIR,
        AGENT_OUTPUT_DIR,
        CONVERSATIONS_DIR,
        TELEMETRY_DIR,
        # NOTE: ``STUDY_SNAPSHOTS_DIR`` is intentionally NOT created here.
        # It is a human-reviewed baseline under ``data/snapshots/{STUDY}/``.
        # Auto-creating it would hide the absence of a reviewed fallback.
    ]
    for path in [OUTPUT_DIR, TMP_DIR, *sensitive_paths]:
        path.mkdir(parents=True, exist_ok=True)
    for path in sensitive_paths:
        # Best-effort: a chmod failure (e.g., not the file owner) is not a
        # fatal startup error.
        with contextlib.suppress(OSError):
            path.chmod(0o700)
# ----------------------------------------------------------------------------
# VALIDATION
# ----------------------------------------------------------------------------
def validate_config() -> None:
    """Validate that the on-disk study layout matches this configuration.

    Checks required input paths exist, warns (without failing) when the
    optional annotated-PDFs directory is absent, and rejects an empty data
    dictionary. Logs the final study name on success. (Removed a stray
    "[docs]" Sphinx-export artifact that preceded this def.)

    Raises:
        FileNotFoundError: When a required input path is missing, or the
            dictionary directory exists but contains no entries.
    """
    # --- PATH VALIDATION ---
    required_paths = [
        RAW_DATA_DIR,
        STUDY_DATA_DIR,
        DATASETS_DIR,
        DATA_DICTIONARY_DIR,
    ]
    for path in required_paths:
        if not path.exists():
            raise FileNotFoundError(f"Missing required path: {path}")
    # PDF source is optional — the pipeline handles its absence gracefully
    if not ANNOTATED_PDFS_DIR.exists():
        logger.warning(
            "Annotated PDFs directory not found: %s — PDF extraction will be skipped",
            ANNOTATED_PDFS_DIR,
        )
    # Ensure the dictionary directory contains at least one file
    if DATA_DICTIONARY_DIR.is_dir() and not any(DATA_DICTIONARY_DIR.iterdir()):
        raise FileNotFoundError(f"Dictionary directory is empty: {DATA_DICTIONARY_DIR}")
    # --- LOG FINAL STATE ---
    logger.info(
        "Config loaded | study=%s",
        STUDY_NAME,
    )