# Source code for scripts.lint_doc_freshness

"""Doc-freshness linter for the RePORT AI Portal.

What. Compares live, source-of-truth values (tool count from
``ALL_TOOLS``, repo version from ``__version__``, action-class count from
``phi_scrub.yaml``) against the prose in README and Sphinx user,
IRB/auditor, and developer guides. Also rejects forbidden phrases that
indicate retired architecture (vector DB / RAG / Presidio-as-active /
"only zone the LLM agent reads" / stale tool counts / stale Make
targets) or inaccessible link text (``click here`` / ``read this
article``).

Why. Three rounds of freshness sweeps converged the docs to current
state, but inline counts and architecture words drift the moment code
changes. Doing this in CI means a future PR that adds a 13th tool — or
removes one — fails the docs-quality-check stage with a precise pointer
to the line(s) that need updating, instead of silently producing stale
docs that the next reviewer has to discover from scratch.

How. Two passes:

1. **Live-value comparison** — read the ``ALL_TOOLS`` length and the
   ``__version__`` literal straight from their source files (no package
   import), and parse ``phi_scrub.yaml`` for action classes — i.e., the
   current code-owned values. For each live value, look for forbidden
   patterns ("11 structured-data tools", "12 callables", etc.) that
   contradict it. Report contradictions.
2. **Forbidden-phrase scan** — a curated list of patterns that should
   NEVER appear in any tracked doc (vector index claims, "only zone
   the LLM agent reads", retired Make targets, etc.).

The linter exits non-zero on any finding, which fails CI. Each finding
prints ``path:line: REASON`` so the dev sees exactly where to look.
Disclaimers ("no chunking, no embedding") are passed through allowlist
patterns that match the canonical phrasing.
"""

from __future__ import annotations

import re
import sys
from collections.abc import Iterable
from dataclasses import dataclass
from pathlib import Path

# Repository root resolved relative to this script.
# NOTE(review): ``parent.parent`` assumes this file lives exactly one
# directory below the repo root (e.g., ``scripts/``) — confirm before moving.
REPO_ROOT = Path(__file__).resolve().parent.parent

# Files / directories the linter scans for prose drift.
TRACKED_FILES: tuple[str, ...] = ("README.md",)
TRACKED_DIRS: tuple[str, ...] = ("docs/sphinx",)
# Glob patterns applied recursively under each tracked directory.
TRACKED_GLOBS: tuple[str, ...] = (
    "*.rst",
    "*.md",
)
# Directory names anywhere in a path that mark generated / vendored content.
# Matched as path *parts* so e.g. ``docs/sphinx/_build/html/index.html`` is
# excluded, but a hypothetical ``docs/sphinx/_buildguide.rst`` (no such file
# today) is not.
EXCLUDED_PATH_PARTS: frozenset[str] = frozenset(
    {
        "_build",
        "_static",
        "_templates",
        ".venv",
        "venv",
        "node_modules",
        "__pycache__",
        ".pytest_cache",
        ".mypy_cache",
        ".ruff_cache",
    }
)


@dataclass(frozen=True)
class Finding:
    """One drift instance: file, line, and the reason it's stale."""

    path: Path      # absolute path of the offending doc file
    line_no: int    # 1-indexed line number within that file
    line: str       # raw text of the offending line
    reason: str     # human-readable explanation of the drift

    def render(self) -> str:
        """Format as ``relative/path:line_no: reason`` plus the quoted line."""
        location = self.path.relative_to(REPO_ROOT)
        header = f"{location}:{self.line_no}: {self.reason}"
        return f"{header}\n > {self.line.rstrip()}"
def _iter_tracked_files() -> Iterable[Path]: """Yield every tracked doc file as an absolute Path. Skips paths that contain any segment in :data:`EXCLUDED_PATH_PARTS`, so locally-generated build output (e.g., ``docs/sphinx/_build/``, which is gitignored but commonly present after ``make docs``) does not produce noisy false positives or slow the linter down. """ for name in TRACKED_FILES: candidate = REPO_ROOT / name if candidate.is_file(): yield candidate for directory in TRACKED_DIRS: base = REPO_ROOT / directory if not base.is_dir(): continue for glob in TRACKED_GLOBS: for path in base.rglob(glob): if EXCLUDED_PATH_PARTS.intersection(path.parts): continue yield path def _live_tool_count() -> int: """Read the canonical ``ALL_TOOLS`` length without importing the package. Avoids the import-time side effects of ``scripts.ai_assistant.agent_tools`` (langchain, ollama, etc. — heavy and not always installed). Instead, parse the literal list from the source file. """ src = (REPO_ROOT / "scripts" / "ai_assistant" / "agent_tools.py").read_text() match = re.search(r"ALL_TOOLS\s*=\s*\[(.*?)\]", src, re.DOTALL) if not match: raise RuntimeError("ALL_TOOLS literal not found in agent_tools.py") body = match.group(1) # Count non-empty, non-comment lines that look like a bare identifier. entries = [ line.strip().rstrip(",") for line in body.splitlines() if line.strip() and not line.strip().startswith("#") ] return len(entries) def _live_version() -> str: """Read ``__version__`` literal from the canonical source file.""" src = (REPO_ROOT / "__version__.py").read_text() match = re.search(r'__version__\s*:\s*str\s*=\s*"([^"]+)"', src) if not match: raise RuntimeError("__version__ literal not found in __version__.py") return match.group(1) def _live_action_class_count() -> int: """Count distinct action classes in ``scripts/security/phi_scrub.yaml``. The catalog ships eight: keep / birthdate / drop / cap / generalize / suppress_small_cell / date_jitter / id_pseudonymize. 
Each appears as a top-level YAML key (``<name>_fields:`` or ``<name>_field:``). """ yaml_path = REPO_ROOT / "scripts" / "security" / "phi_scrub.yaml" if not yaml_path.is_file(): return 8 # fall back to documented constant expected = { "keep_fields", "birthdate_field", "drop_fields", "cap_fields", "generalize_fields", "suppress_small_cell_fields", "date_fields", "id_fields", } seen: set[str] = set() for line in yaml_path.read_text().splitlines(): head = line.split(":", 1)[0].strip() if head in expected: seen.add(head) return len(seen) or 8 # --------------------------------------------------------------------------- # Forbidden phrases — each entry is (regex, reason). Matches are reported # unless the line ALSO matches one of the allowlist patterns paired with the # entry. Patterns are case-insensitive unless noted. # --------------------------------------------------------------------------- FORBIDDEN: tuple[tuple[str, str, tuple[str, ...]], ...] = ( # Stale tool counts. Match phrasing like "12 tools", "12 structured-data # tools", "12 callables", "fixed list of 12 callables", etc. Plural is the # normal doc form, so the trailing noun MUST allow ``s?``. ( r"\b(\d+)\s+(?:structured(?:[-\s]?data)?|callable|@tool)s?\s+(?:tools?|callables?)\b", "stale tool count — current ALL_TOOLS length is {tool_count}", (), ), ( r"\bfixed\s+list\s+of\s+(\d+)\s+callables?\b", "stale tool count — current ALL_TOOLS length is {tool_count}", (), ), # Bare "N tools" / "N callables" without a leading qualifier. Catches # phrasings like "12 tools" alone in prose. ( r"\b(\d+)\s+(?:tools?|callables?)\b(?!\s+(?:run|exec|invocation))", "stale tool count — current ALL_TOOLS length is {tool_count}", ( # Allowlist — when the cited count matches live ALL_TOOLS, it's # picked up by the count-equality short-circuit below. 
), ), # Vector-DB / RAG / chunking as architectural CLAIMS (not disclaimers) ( r"vector\s+(db|store|index|database)", "vector DB residue — pipeline does not build a vector index", ( r"no\s+(chunking|embedding|vector)", r"without\s+(chunking|embedding|vector)", r"vector\s+db\s+integration\s*\(if", # future-work note r"how\s+the\s+agent\s+reads\s+the\s+bundle.*no\s+vector", ), ), ( r"\bembedding\s+(index|store|model|search)", "embeddings residue — pipeline has no embedding step", (r"no\s+embedding", r"without\s+embedding"), ), ( r"\bsemantic\s+(search|retrieval)", "semantic search residue — agent uses {tool_count} structured tools, no semantic retrieval", (r"no\s+semantic\s+(search|retrieval)",), ), # "Only zone" / "agent never exposed" residue ( r"the\s+only\s+zone\s+the\s+(llm\s+)?agent", '"only zone" residue — current LLM read zone is trio_bundle + agent', (), ), ( r"only\s+zone\s+the\s+downstream", '"only zone" residue — current LLM read zone is trio_bundle + agent', (), ), # 35-criterion / 31-criterion bare (without follow-up qualifier) ( r"35-?criterion", "35-criterion needs the qualifier '(31 original + 4 added via patches 2026-04-23a/b)'", ( r"31\s*original", r"31\s*\+\s*4", r"four\s+follow-?ups", r"plus\s+four\s+follow-?ups", r"4\s+added", r"35\s*/\s*35\s*criteria\s+architecturally", ), ), # Stale Makefile target names ( r"\bmake\s+extract-pdfs\b", "stale Make target — use `make pdf-extract`", (), ), # Pre-scrubbed wording (operators don't pre-scrub; pipeline does at Step 1.6) ( r"datasets?\s+must\s+be\s+pre-?scrubbed", '"pre-scrubbed" residue — pipeline scrubs at Step 1.6 on AMBER staging', (), ), ( r"reads?\s+pre-?scrubbed\s+study", '"pre-scrubbed" residue — Step 1.6 in-pipeline scrub is canonical', (), ), ( r"\.xls(?!x)", "legacy .xls residue — supported tabular inputs are .xlsx and .csv only", (), ), ( r"xlsx,\s*xls,\s*csv", "legacy .xls residue — supported tabular inputs are .xlsx and .csv only", (), ), ( r"\bvllm\b", "stale provider claim — supported 
provider IDs are openai, anthropic, google-genai, ollama, nvidia-ai-endpoints", (), ), ( r"test_dataset_extraction\.py", "stale test filename — dataset extraction coverage lives in tests/test_dataset_pipeline.py", (), ), ( r"test_date_transform\.py", "stale test filename — SANT/date coverage lives in tests/test_phi_scrub.py", (), ), ( r"ai_assistant/\s+#.*planned", "stale test-tree claim — AI Assistant tests are active top-level tests, not a planned tests/ai_assistant folder", (), ), ( r"extraction/\s+#.*planned", "stale test-tree claim — extraction tests are active top-level tests, not a planned tests/extraction folder", (), ), ( r"\b(?:80|90|100)%\s+(?:code\s+)?coverage", "coverage threshold claim is not enforced by current CI; document runnable gates instead", (), ), ( r"\bclick\s+here\b", "vague link text — name the destination or action", (), ), ( r"\bread\s+this\s+(article|document|page)\b", "vague link text — name the destination or action", (), ), # Stale streamlit version pin ( r"streamlit\s+1\.5\d", "stale Streamlit pin — pyproject.toml requires >=1.38, <2.0", (), ), # Stale Llama default ( r"\bllama-?[23]\b\s+(default|model|provider)", "stale default model — qwen3 replaced llama since commit df52ec4", (), ), # "PDF-snippet sanitiser" gap claims (closed in patch-2026-04-23a) ( r"no\s+pdf-?snippet\s+(instruction\s+)?sanitiser\s+(today|currently)", "patch-2026-04-23a closed this gap (sanitise_untrusted_snippet)", (), ), ( r"sanitiser\s+is\s+the\s+planned\s+hardening", "patch-2026-04-23a already shipped this hardening", (), ), # Old ONLY-zone residue in IRB diagrams (handled separately above; keep # an additional pattern for the ASCII variant) ( r"<--\s*the\s+only\s+zone", '"the ONLY zone" residue — current LLM read zone is trio_bundle + agent', (), ), # ``__version__`` references must agree with the canonical literal in # ``__version__.py``. Match patterns like ``__version__ = "0.x.y"``, # ``"version": "0.x.y"``, or "current version 0.x.y" in prose. 
The # version-equality short-circuit in :func:`_check_file` skips matches # whose digits equal the live version. ( # Match both ``__version__ = "X.Y.Z"`` and the annotated # ``__version__: str = "X.Y.Z"`` forms used in docs/code samples. r"__version__[^\"\n]*\"\d+\.\d+\.\d+\"", "stale __version__ literal — canonical is {version}", (), ), ( r"current\s+version\s*:?\s*\d+\.\d+\.\d+", "stale 'current version' claim — canonical is {version}", (), ), # Action-class catalog count drift ("8-action catalog", "7-action # classes", etc.). The action-count short-circuit in # :func:`_check_file` skips correct-number matches. ( r"\b\d+[-\s]+action\s+(?:catalog|catalogue|classes?|set)\b", "stale action-class count — canonical is {action_count} classes (see scripts/security/phi_scrub.yaml)", (), ), ) def _check_file( path: Path, *, tool_count: int, version: str, action_count: int, ) -> list[Finding]: """Return every drift Finding produced by ``path``. Disclaimer allowlists are evaluated on a 5-line window (the matching line plus the two before and two after) so wrapped prose like:: ... directly. No chunking, embedding, or vector index is needed. is recognised as a disclaimer even when "vector" lands on a separate line from "no". """ findings: list[Finding] = [] try: text = path.read_text() except (OSError, UnicodeDecodeError): return findings lines = text.splitlines() lower_lines = [line.lower() for line in lines] for index, raw in enumerate(lines): line_no = index + 1 lower = lower_lines[index] # 5-line window: 2 before, current, 2 after. window_lo = max(0, index - 2) window_hi = min(len(lower_lines), index + 3) window_text = " ".join(lower_lines[window_lo:window_hi]) for pattern, reason_tmpl, allowlist in FORBIDDEN: if not re.search(pattern, lower): continue if any(re.search(allow, window_text) for allow in allowlist): continue # Special-case the tool-count regexes: the patterns match both # correct (e.g., "12 tools") and stale (e.g., "10 tools") cases. 
# Skip when the captured number equals the live count. Match # plural and singular noun heads so "tools" / "callables" / # "@tool" / "structured-data tools" all resolve the count. count_match = re.search( r"\b(\d+)\s+(?:structured(?:[-\s]?data)?\s+)?(?:tools?|callables?|@?tool)", lower, ) if count_match: cited = int(count_match.group(1)) if cited == tool_count: continue # Action-class count drift ("8-action catalog", "eight action # classes" — only the digit form is auto-checked here; the # spelled-out form is left alone because it doesn't drift in # practice). action_match = re.search( r"\b(\d+)[-\s]+action\s+(?:catalog|catalogue|classes?|set)", lower, ) if action_match: cited = int(action_match.group(1)) if cited == action_count: continue # Version drift — only run the equality short-circuit when the # matched FORBIDDEN entry is itself version-related (otherwise an # unrelated FORBIDDEN match on a line that happens to mention the # live version would be silently dropped). if "version" in pattern.lower(): version_match = re.search(r"\b(\d+\.\d+\.\d+)\b", lower) if version_match and version_match.group(1) == version: continue reason = reason_tmpl.format( tool_count=tool_count, version=version, action_count=action_count, ) findings.append(Finding(path=path, line_no=line_no, line=raw, reason=reason)) return findings
def main() -> int:
    """Run every check and return a process exit code (0 = clean, 1 = drift)."""
    n_tools = _live_tool_count()
    live_version = _live_version()
    n_actions = _live_action_class_count()
    print(
        f"[doc-freshness] live values: ALL_TOOLS={n_tools}, "
        f"__version__={live_version}, action_classes={n_actions}",
        file=sys.stderr,
    )

    # Scan every tracked doc in deterministic (sorted) order.
    drift: list[Finding] = []
    for doc_path in sorted(_iter_tracked_files()):
        drift += _check_file(
            doc_path,
            tool_count=n_tools,
            version=live_version,
            action_count=n_actions,
        )

    if drift:
        print(f"[doc-freshness] FAIL — {len(drift)} drift(s):", file=sys.stderr)
        for item in drift:
            print(item.render(), file=sys.stderr)
        print(
            "\nRefresh the offending lines, then re-run "
            "`uv run --frozen python scripts/lint_doc_freshness.py`.",
            file=sys.stderr,
        )
        return 1

    print("[doc-freshness] OK — no stale-doc drift detected.", file=sys.stderr)
    return 0
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return code, exactly as the
    # former ``raise SystemExit(main())`` did.
    sys.exit(main())