# Source code for config

"""Central runtime configuration for RePORT AI Portal.

**What.** All path constants, environment-variable resolution, study
detection, LLM provider inference, staging-directory management,
and directory creation in one place.

**Why.** 138 call sites across the pipeline, agent, UI, and test suite
use ``import config`` — a single canonical location prevents scattered
``os.getenv`` and ``Path(...)`` literals throughout the codebase.

**How.** All values are resolved at import time. ``STUDY_NAME`` is
determined by the ``$STUDY_NAME`` env var or a filesystem scan of
``data/raw/``. LLM provider is inferred from model-name prefix unless
overridden by ``$LLM_PROVIDER``. Staging directories are NOT created
eagerly; call :func:`ensure_directories` after startup.
"""

# config.py
from __future__ import annotations

import logging
import os
from pathlib import Path
from typing import Any, overload

import yaml

# ----------------------------------------------------------------------------
# ENV HELPERS (centralized, validated access)
# ----------------------------------------------------------------------------


@overload
def _get_env(key: str, default: str) -> str: ...
@overload
def _get_env(key: str, default: None = None) -> str | None: ...
def _get_env(key: str, default: str | None = None) -> str | None:
    value = os.getenv(key)
    return default if value is None or value == "" else value


def _get_env_int(key: str, default: int) -> int:
    """Resolve *key* as an integer, falling back to *default* when unset/empty.

    Raises ``ValueError`` (with the offending key named) when the variable
    is set but not parseable as an integer.
    """
    text = _get_env(key)
    if text is None:
        return default
    try:
        parsed = int(text)
    except ValueError as exc:
        raise ValueError(f"{key} must be an integer") from exc
    return parsed


def _get_env_bool(key: str, default: bool) -> bool:
    """Resolve *key* as a boolean flag.

    Truthy spellings (case-insensitive): ``1``, ``true``, ``yes``, ``on``.
    Anything else — including unparseable garbage — reads as False.
    """
    truthy = ("1", "true", "yes", "on")
    raw = _get_env(key, str(default))
    return str(raw).lower() in truthy


def production_mode_enabled() -> bool:
    """Return True when production controls should fail closed."""
    if _get_env_bool("REPORT_AI_PRODUCTION", False):
        return True
    if _get_env_bool("REPORT_AI_REQUIRE_PHI_LOG_REDACTOR", False):
        return True
    auth_mode = str(_get_env("REPORT_AI_AUTH_MODE", "")).strip().lower()
    return auth_mode == "proxy"
def strict_study_detection_enabled() -> bool:
    """Return True when missing auto-detected study data should abort import."""
    return _get_env_bool("REPORT_AI_STRICT_STUDY_DETECTION", False)
# ----------------------------------------------------------------------------
# YAML CONFIG (config/config.yaml — optional overlay)
# ----------------------------------------------------------------------------

CONFIG_YAML_PATH = Path(__file__).resolve().parent / "config" / "config.yaml"


def _load_yaml_config() -> dict[str, Any]:
    """Load config.yaml if it exists; return empty dict otherwise."""
    if not CONFIG_YAML_PATH.is_file():
        return {}
    with CONFIG_YAML_PATH.open() as fh:
        loaded = yaml.safe_load(fh)
    # A file containing a scalar/list (or nothing) is treated as "no overlay".
    return loaded if isinstance(loaded, dict) else {}


_YAML_CFG: dict[str, Any] = _load_yaml_config()
def yaml_get(*keys: str, default: Any = None) -> Any:
    """Retrieve a nested key from the loaded YAML config.

    >>> yaml_get("app", "log_level", default="INFO")
    'INFO'
    """
    node: Any = _YAML_CFG
    for key in keys:
        if not isinstance(node, dict):
            return default
        node = node.get(key)
    return default if node is None else node
# ----------------------------------------------------------------------------
# VERSION
# ----------------------------------------------------------------------------
try:
    from __version__ import __version__
except ImportError:
    # Fallback for environments where the version stamp was not generated.
    __version__ = "0.0.0"

DEFAULT_DATASET_NAME = "Indo-VAP"
DEFAULT_LOG_LEVEL = "INFO"
LOG_NAME = "report_ai_portal"
# Precedence: $LOG_LEVEL env var → config.yaml app.log_level → "INFO".
LOG_LEVEL = _get_env("LOG_LEVEL", yaml_get("app", "log_level", default=DEFAULT_LOG_LEVEL))

logger = logging.getLogger(LOG_NAME)

# ----------------------------------------------------------------------------
# BASE PATHS
# ----------------------------------------------------------------------------
# ``__file__`` is absent in some embedded/frozen contexts; fall back to CWD.
BASE_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
DATA_DIR = BASE_DIR / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
OUTPUT_DIR = BASE_DIR / "output"
LOGS_DIR = BASE_DIR / ".logs"
TMP_DIR = BASE_DIR / "tmp"

# ----------------------------------------------------------------------------
# STUDY DETECTION
# ----------------------------------------------------------------------------
def detect_study_name(*, strict: bool | None = None) -> str:
    """Auto-detect the study folder under ``data/raw/``.

    A valid study is the first (alphabetically) non-hidden, non-excluded
    directory containing a ``datasets/`` subdirectory. When *strict* is
    None, strictness comes from :func:`strict_study_detection_enabled`.
    In strict mode any failure raises ``RuntimeError``; otherwise the
    function warns and falls back to ``DEFAULT_DATASET_NAME``.
    """
    if strict is None:
        strict = strict_study_detection_enabled()

    if not RAW_DATA_DIR.exists():
        msg = f"RAW_DATA_DIR missing: {RAW_DATA_DIR}"
        if strict:
            raise RuntimeError(msg)
        logger.warning("%s → using default: %s", msg, DEFAULT_DATASET_NAME)
        return DEFAULT_DATASET_NAME

    try:
        exclude = {".backup", ".DS_Store", "output"}
        candidates = [
            entry.name
            for entry in RAW_DATA_DIR.iterdir()
            if entry.is_dir() and not entry.name.startswith(".") and entry.name not in exclude
        ]
        # Deterministic pick: alphabetical order, first dir with datasets/.
        for candidate in sorted(candidates):
            if (RAW_DATA_DIR / candidate / "datasets").is_dir():
                return candidate
        msg = f"No valid study found under {RAW_DATA_DIR}"
        if strict:
            raise RuntimeError(msg)
        logger.warning("%s → using default: %s", msg, DEFAULT_DATASET_NAME)
        return DEFAULT_DATASET_NAME
    except OSError as exc:
        # Filesystem errors (permissions, transient mounts) during the scan.
        if strict:
            raise RuntimeError(f"Study detection failed under {RAW_DATA_DIR}") from exc
        logger.warning("Study detection failed → fallback to default", exc_info=True)
        return DEFAULT_DATASET_NAME
# ENV override ALWAYS wins
_STUDY_NAME_ENV = _get_env("STUDY_NAME")
if _STUDY_NAME_ENV:
    # Reject path-like values so STUDY_NAME can never escape data/raw/.
    if "/" in _STUDY_NAME_ENV or "\\" in _STUDY_NAME_ENV or _STUDY_NAME_ENV in {".", ".."}:
        raise ValueError("STUDY_NAME must be a plain folder name, not a path")
    STUDY_NAME = _STUDY_NAME_ENV
else:
    STUDY_NAME = detect_study_name()

# ----------------------------------------------------------------------------
# STUDY PATHS
# ----------------------------------------------------------------------------
STUDY_DATA_DIR = RAW_DATA_DIR / STUDY_NAME
STUDY_OUTPUT_DIR = OUTPUT_DIR / STUDY_NAME

# Raw study subdirectories (under data/raw/<study>/)
DATASETS_DIR = STUDY_DATA_DIR / "datasets"
ANNOTATED_PDFS_DIR = STUDY_DATA_DIR / "annotated_pdfs"
DATA_DICTIONARY_DIR = STUDY_DATA_DIR / "data_dictionary"

# Trio bundle is the single consolidated clean-output tree. Everything that
# was formerly split across clean/jsonl/* now lives here.
TRIO_BUNDLE_DIR = STUDY_OUTPUT_DIR / "trio_bundle"
TRIO_DATASETS_DIR = TRIO_BUNDLE_DIR / "datasets"
STUDY_AUDIT_DIR = STUDY_OUTPUT_DIR / "audit"

# Audit-report paths (written by the cleanup/dedup pipeline).
# Only the dataset leg produces audit reports — dictionary and PDF legs carry
# no PHI, so their cleanup is side-effect-only (pruning without a report).
# Step-cache manifests for dataset_processing also land under STUDY_AUDIT_DIR
# so the LLM-visible trio_bundle/ stays content-only.
AUDIT_DATASET_REPORT_PATH: Path = STUDY_AUDIT_DIR / "dataset_cleanup_report.json"
AUDIT_SCRUB_REPORT_PATH: Path = STUDY_AUDIT_DIR / "phi_scrub_report.json"

# Dictionary and PDF extraction artifacts live inside the trio bundle so
# the LLM agent can read them alongside the canonical datasets.
DICTIONARY_JSON_OUTPUT_DIR = TRIO_BUNDLE_DIR / "dictionary"
PDF_EXTRACTIONS_DIR = TRIO_BUNDLE_DIR / "pdfs"

# ----------------------------------------------------------------------------
# AGENT STATE TIER (per-session state, NOT study output)
# ----------------------------------------------------------------------------
# Per-session, agent-owned operational state — analysis runs, conversation
# transcripts. Telemetry lives under STUDY_AUDIT_DIR so the LLM's permitted
# agent/** zone stays free of operator-audit bytes. Everything inside the
# fully-gitignored ``output/`` tree keeps PHI-scrubbed cohort bytes out of
# git by default.
AGENT_STATE_DIR: Path = STUDY_OUTPUT_DIR / "agent"
AGENT_OUTPUT_DIR: Path = AGENT_STATE_DIR / "analysis"
CONVERSATIONS_DIR: Path = AGENT_STATE_DIR / "conversations"

# ----------------------------------------------------------------------------
# SNAPSHOT TIER (human-reviewed baseline; LLM-INVISIBLE)
# ----------------------------------------------------------------------------
# Snapshots are separate from raw data, agent state, and the live trio bundle.
# They hold a cleaned-and-verified trio bundle (datasets / dictionary / pdfs /
# variables.json) saved by a human after review. When PDF extraction fails or
# the operator clicks "Use Existing Study", this baseline is copied over the
# live ``output/{STUDY}/trio_bundle/``.
#
# Read posture:
#   - The pipeline reads this baseline when fresh PDF extraction cannot
#     produce a complete bundle.
#   - The wizard's "Use Existing Study" path restores this baseline over the
#     live trio bundle before chat starts.
#   - The LLM agent's read zone is restricted to ``trio_bundle/`` + ``agent/``;
#     ``data/snapshots/`` is intentionally outside that zone so the LLM cannot
#     accidentally read a stale baseline as if it were live data.
#
# Path: under ``data/`` because it is reviewed study data, not runtime output.
STUDY_SNAPSHOTS_DIR: Path = DATA_DIR / "snapshots" / STUDY_NAME

# Staging workspace — per-study tree inside TMP_DIR. Managed per-run by
# main.py's _prepare_staging() / _publish_staging(); NOT created eagerly by
# ensure_directories() so a stale workspace from a crashed previous run is
# always purged explicitly before reuse.
STUDY_STAGING_DIR: Path = TMP_DIR / STUDY_NAME
STAGING_DATASETS_DIR: Path = STUDY_STAGING_DIR / "datasets"
STAGING_DICTIONARY_DIR: Path = STUDY_STAGING_DIR / "dictionary"
STAGING_PDFS_DIR: Path = STUDY_STAGING_DIR / "pdfs"

# Unified variables reference (built from dictionary + PDF extractions)
VARIABLES_JSON_PATH: Path = TRIO_BUNDLE_DIR / "variables.json"

# ----------------------------------------------------------------------------
# PHI SCRUB
# ----------------------------------------------------------------------------
# Narrow PHI handling: per-subject deterministic date jitter (SANT method) +
# HMAC-SHA256 ID pseudonymization. See scripts/security/phi_scrub.py.
#
# Config file lives alongside the module so study-specific regex patterns can
# be edited without touching code.
PHI_SCRUB_CONFIG_PATH: Path = BASE_DIR / "scripts" / "security" / "phi_scrub.yaml"


def _phi_key_path() -> Path:
    """Resolve the sidecar PHI HMAC key path.

    Uses ``$XDG_CONFIG_HOME/report_ai_portal/phi_key`` when the env var is
    set, otherwise falls back to ``~/.config/report_ai_portal/phi_key``.
    The key lives OUTSIDE the repo tree and is never read by the agent or
    committed to git.
    """
    xdg = os.getenv("XDG_CONFIG_HOME")
    base = Path(xdg) if xdg else Path.home() / ".config"
    return base / "report_ai_portal" / "phi_key"


PHI_KEY_PATH: Path = _phi_key_path()

# ----------------------------------------------------------------------------
# EXTRACTION CONFIG (centralized — used by all extraction modules)
# ----------------------------------------------------------------------------
# Temporary-file prefixes for atomic writes. Each module uses its own prefix
# so crash-leftover temp files can be attributed to their source.
TEMP_PREFIX_DATASET: str = "report_ai_portal_dataset_"
TEMP_PREFIX_PDF: str = "report_ai_portal_pdf_extract_"
TEMP_PREFIX_DICT: str = "report_ai_portal_dict_"
TEMP_PREFIX_TRIO_BUNDLE: str = "report_ai_portal_trio_bundle_"
TEMP_PREFIX_DEDUP: str = "report_ai_portal_dedup_"

# Secure temp workspace — the prefix is intentionally generic+randomised so
# the directory name leaks no information about what pipeline stage created it.
SECURE_TEMP_PREFIX: str = "rpln_"

# PDF extraction — rate-limit settings
PDF_EXTRACTION_INTER_DELAY: float = float(_get_env("PDF_INTER_DELAY", "10.0"))
PDF_EXTRACTION_MAX_TOKENS: int = _get_env_int("PDF_MAX_TOKENS", 64000)

# Duplicate-column detection regex for dataset extraction
DUPLICATE_COLUMN_PATTERN: str = r"^(.+?)_?(\d+)$"

# ----------------------------------------------------------------------------
# LLM PROVIDER INFERENCE
# ----------------------------------------------------------------------------


def _infer_provider(model_name: str) -> str:
    """Infer LangChain provider string from model name prefix.

    Recognised patterns (checked in order):
        "org/model" slugs (moonshotai/, nvidia/, mistralai/, deepseek-ai/,
            qwen/, meta/)                               → "nvidia-ai-endpoints"
        llama*, mistral*, phi3*/phi-3*, gemma*, qwen* (incl. qwen3:8b),
            deepseek*, codellama*, tinyllama*, vicuna*, falcon*, orca*
                                                        → "ollama"
        claude*                                         → "anthropic"
        gpt-*, o1*, o3*, o4*, text-davinci*             → "openai"
        gemini*                                         → "google-genai"

    Falls back to ``"ollama"`` (local inference, no API key needed).
    """
    m = model_name.lower()
    # BUG FIX: NVIDIA "org/model" slugs must be tested BEFORE the bare ollama
    # prefixes. "qwen/...", "mistralai/..." and "deepseek-ai/..." also start
    # with "qwen"/"mistral"/"deepseek", so the previous order made those slug
    # entries unreachable and misrouted NVIDIA-hosted models to "ollama".
    _nvidia_orgs = ("moonshotai/", "nvidia/", "mistralai/", "deepseek-ai/", "qwen/", "meta/")
    if m.startswith(_nvidia_orgs):
        return "nvidia-ai-endpoints"
    _ollama_prefixes = (
        "llama",
        "mistral",
        "phi3",
        "phi-3",
        "gemma",
        "qwen",
        "deepseek",
        "codellama",
        "tinyllama",
        "vicuna",
        "falcon",
        "orca",
    )
    if m.startswith(_ollama_prefixes):
        return "ollama"
    if m.startswith("claude"):
        return "anthropic"
    if m.startswith(("gpt-", "o1", "o3", "o4", "text-davinci")):
        return "openai"
    if m.startswith("gemini"):
        return "google-genai"
    return "ollama"  # safe default — local inference, no key needed


LLM_MODEL = _get_env("LLM_MODEL", yaml_get("ai_assistant", "llm_model", default="qwen3:8b"))

# LLM_PROVIDER: explicit env var wins; otherwise infer from model name.
LLM_PROVIDER: str = _get_env("LLM_PROVIDER") or _infer_provider(LLM_MODEL)

# Qwen3 downgrade ladder — descending parameter count. When Ollama refuses
# a rung with "requires more system memory", _init_llm walks this list to
# find the largest rung that actually loads. Only applies to qwen3:* models;
# other models (Claude, GPT, custom Ollama) pass through unchanged.
QWEN3_DOWNGRADE_LADDER: tuple[str, ...] = ("qwen3:8b", "qwen3:4b", "qwen3:1.7b")
def preferred_or_installed_downgrade(model: str) -> list[str]:
    """Return the sequence of model names to try starting at ``model``.

    For qwen3 rungs in :data:`QWEN3_DOWNGRADE_LADDER`, returns the ladder
    from the given rung downward. Any other model yields a one-element
    list — only qwen3 is auto-stepped, because its three rungs are
    behaviourally compatible (same family, same tool-use format, same
    thinking convention).
    """
    try:
        start = QWEN3_DOWNGRADE_LADDER.index(model)
    except ValueError:
        # Not a ladder rung: no automatic downgrades.
        return [model]
    return list(QWEN3_DOWNGRADE_LADDER[start:])
# ----------------------------------------------------------------------------
# AI Assistant / AGENT
# ----------------------------------------------------------------------------
# Telemetry lives under STUDY_AUDIT_DIR (not AGENT_STATE_DIR) to keep the
# LLM's permitted agent/** zone clear of operator-audit bytes. Per the PHI
# rule, LLM must never read telemetry; parking it under audit/ — the same
# zone that holds phi_scrub_report.json and dataset_cleanup_report.json —
# makes that boundary structural, not a per-file carve-out.
TELEMETRY_DIR = STUDY_AUDIT_DIR / "telemetry"
TELEMETRY_SINK = TELEMETRY_DIR / "events.jsonl"

# Chat / agent
AGENT_MAX_TOKENS: int = _get_env_int("AGENT_MAX_TOKENS", 16384)
AGENT_TIMEOUT: int = _get_env_int("AGENT_TIMEOUT", 300)
CHAT_RATE_LIMIT_WINDOW_SECONDS: int = _get_env_int("CHAT_RATE_LIMIT_WINDOW_SECONDS", 60)
CHAT_RATE_LIMIT_MAX_TURNS: int = _get_env_int("CHAT_RATE_LIMIT_MAX_TURNS", 12)

# Watchdog on the agent stream: raise TimeoutError if no chunk is produced
# for this many seconds. Measures inter-chunk idle time, NOT total wall
# clock — so slow-but-steady streams (long tool runs) stay alive. The E3
# benchmark stall went 6+ minutes of total silence with no stop signal;
# 180s is ~10x the p99 of a healthy routing step.
AGENT_STREAM_IDLE_TIMEOUT: int = _get_env_int("AGENT_STREAM_IDLE_TIMEOUT", 180)

# Analytical engine limits
ANALYSIS_TIMEOUT: int = _get_env_int("ANALYSIS_TIMEOUT", 300)
ANALYSIS_MAX_OUTPUT: int = _get_env_int("ANALYSIS_MAX_OUTPUT", 200_000)
ANALYSIS_MAX_FIGURES: int = _get_env_int("ANALYSIS_MAX_FIGURES", 20)

# Sandbox subprocess limits — operational tunables (safe to lower; lowering
# only tightens the security envelope). The trust boundary itself
# (import allowlist, env-var blocklist, AST guards) is hardcoded in
# ``scripts.ai_assistant.sandbox`` and is not configurable from here.
#
# Defaults sized for production runs of the typical pandas+numpy+plotly
# stack: numpy alone reserves ~700 MB of address space on Linux when loaded
# (RLIMIT_AS is whole-vmap, not RSS). RLIMIT_NPROC is per-user not per-tree
# on Linux, so a small cap conflicts with whatever else the host user is
# running — 4096 is high enough to coexist with shared CI environments
# while still preventing runaway fork bombs.
SANDBOX_MAX_MEMORY_MB: int = _get_env_int("SANDBOX_MAX_MEMORY_MB", 2048)
SANDBOX_MAX_PROCS: int = _get_env_int("SANDBOX_MAX_PROCS", 4096)
SANDBOX_MAX_FILES: int = _get_env_int("SANDBOX_MAX_FILES", 256)
# CONSISTENCY FIX: use the shared _get_env_bool helper rather than
# re-implementing the truthy parse inline. Behaviour is identical: same
# truthy set {"1","true","yes","on"}, same default (unset/empty → True).
SANDBOX_PERSIST_CODE: bool = _get_env_bool("SANDBOX_PERSIST_CODE", True)

# Orchestration mode: "auto" | "single-agent" | "multi-agent"
AGENT_ORCHESTRATION_MODE: str = _get_env(
    "AGENT_ORCHESTRATION_MODE",
    yaml_get("ai_assistant", "agent", "orchestration_mode", default="auto"),
)

# Enforce LangChain tracing OFF by default (privacy-first)
os.environ.setdefault("LANGCHAIN_TRACING_V2", "false")

# ----------------------------------------------------------------------------
# DIRECTORY CREATION
# ----------------------------------------------------------------------------
def ensure_directories() -> None:
    """Create runtime directories.

    Sensitive dirs (containing PHI-scrubbed data, agent state,
    conversations, audit, or logs) are hardened to mode 0o700 after
    creation so they're not world-readable under the typical umask 0o022.
    Dirs that may legitimately need group access (``OUTPUT_DIR`` parent,
    ``TMP_DIR`` is already 0o700 via secure-staging) are left at default
    mode.
    """
    import contextlib

    sensitive_paths = [
        STUDY_OUTPUT_DIR,
        LOGS_DIR,
        TRIO_BUNDLE_DIR,
        TRIO_DATASETS_DIR,
        DICTIONARY_JSON_OUTPUT_DIR,
        PDF_EXTRACTIONS_DIR,
        STUDY_AUDIT_DIR,
        AGENT_STATE_DIR,
        AGENT_OUTPUT_DIR,
        CONVERSATIONS_DIR,
        TELEMETRY_DIR,
        # NOTE: ``STUDY_SNAPSHOTS_DIR`` is intentionally NOT created here.
        # It is a human-reviewed baseline under ``data/snapshots/{STUDY}/``.
        # Auto-creating it would hide the absence of a reviewed fallback.
    ]

    for path in (OUTPUT_DIR, TMP_DIR, *sensitive_paths):
        path.mkdir(parents=True, exist_ok=True)

    for path in sensitive_paths:
        # Best-effort: a chmod failure (e.g., not the file owner) is not a
        # fatal startup error.
        with contextlib.suppress(OSError):
            path.chmod(0o700)
# ---------------------------------------------------------------------------- # VALIDATION # ----------------------------------------------------------------------------
def validate_config() -> None:
    """Validate that the resolved study layout exists on disk.

    Raises ``FileNotFoundError`` when a required input path is missing or
    the data-dictionary directory is empty. The annotated-PDFs directory
    is optional and only produces a warning. Logs the final study name on
    success.
    """
    # --- PATH VALIDATION ---
    for required in (RAW_DATA_DIR, STUDY_DATA_DIR, DATASETS_DIR, DATA_DICTIONARY_DIR):
        if not required.exists():
            raise FileNotFoundError(f"Missing required path: {required}")

    # PDF source is optional — the pipeline handles its absence gracefully
    if not ANNOTATED_PDFS_DIR.exists():
        logger.warning(
            "Annotated PDFs directory not found: %s — PDF extraction will be skipped",
            ANNOTATED_PDFS_DIR,
        )

    # Ensure the dictionary directory contains at least one file
    if DATA_DICTIONARY_DIR.is_dir() and not any(DATA_DICTIONARY_DIR.iterdir()):
        raise FileNotFoundError(f"Dictionary directory is empty: {DATA_DICTIONARY_DIR}")

    # --- LOG FINAL STATE ---
    logger.info(
        "Config loaded | study=%s",
        STUDY_NAME,
    )