"""Doc-freshness linter for the RePORT AI Portal.
What. Compares live, source-of-truth values (tool count from
``ALL_TOOLS``, repo version from ``__version__``, action-class count from
``phi_scrub.yaml``) against the prose in README and Sphinx user,
IRB/auditor, and developer guides. Also rejects forbidden phrases that
indicate retired architecture (vector DB / RAG / Presidio-as-active /
"only zone the LLM agent reads" / stale tool counts / stale Make
targets) or inaccessible link text (``click here`` / ``read this
article``).
Why. Three rounds of freshness sweeps converged the docs to current
state, but inline counts and architecture words drift the moment code
changes. Doing this in CI means a future PR that adds a 13th tool — or
removes one — fails the docs-quality-check stage with a precise pointer
to the line(s) that need updating, instead of silently producing stale
docs that the next reviewer has to discover from scratch.
How. Two passes:
1. **Live-value comparison** — import ``ALL_TOOLS`` and
``__version__``, parse ``phi_scrub.yaml`` for action classes,
parse the current code-owned values. For each live value, look for
forbidden patterns ("11 structured-data
tools", "12 callables", etc.) that contradict it. Report
contradictions.
2. **Forbidden-phrase scan** — a curated list of patterns that should
NEVER appear in any tracked doc (vector index claims, "only zone
the LLM agent reads", retired Make targets, etc.).
The linter exits non-zero on any finding, which fails CI. Each finding
prints ``path:line: REASON`` so the dev sees exactly where to look.
Disclaimers ("no chunking, no embedding") are passed through allowlist
patterns that match the canonical phrasing.
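
For example, a hypothetical stale count (file, line number, and counts
all illustrative) is reported as::

    README.md:41: stale tool count — current ALL_TOOLS length is 12
     > The agent exposes a fixed list of 11 callables.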
"""
from __future__ import annotations
import re
import sys
from collections.abc import Iterable
from dataclasses import dataclass
from pathlib import Path
# Repository root resolved relative to this script.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Files / directories the linter scans for prose drift.
TRACKED_FILES: tuple[str, ...] = ("README.md",)
TRACKED_DIRS: tuple[str, ...] = ("docs/sphinx",)
TRACKED_GLOBS: tuple[str, ...] = (
"*.rst",
"*.md",
)
# Directory names anywhere in a path that mark generated / vendored content.
# Matched as path *parts* so e.g. ``docs/sphinx/_build/html/index.html`` is
# excluded, but a hypothetical ``docs/sphinx/_buildguide.rst`` (no such file
# today) is not.
EXCLUDED_PATH_PARTS: frozenset[str] = frozenset(
{
"_build",
"_static",
"_templates",
".venv",
"venv",
"node_modules",
"__pycache__",
".pytest_cache",
".mypy_cache",
".ruff_cache",
}
)
@dataclass(frozen=True)
class Finding:
"""One drift instance: file, line, and the reason it's stale."""
path: Path
line_no: int
line: str
reason: str
def render(self) -> str:
rel = self.path.relative_to(REPO_ROOT)
return f"{rel}:{self.line_no}: {self.reason}\n > {self.line.rstrip()}"
def _iter_tracked_files() -> Iterable[Path]:
"""Yield every tracked doc file as an absolute Path.
Skips paths that contain any segment in :data:`EXCLUDED_PATH_PARTS`,
so locally-generated build output (e.g., ``docs/sphinx/_build/``,
which is gitignored but commonly present after ``make docs``) does
not produce noisy false positives or slow the linter down.
"""
for name in TRACKED_FILES:
candidate = REPO_ROOT / name
if candidate.is_file():
yield candidate
for directory in TRACKED_DIRS:
base = REPO_ROOT / directory
if not base.is_dir():
continue
for glob in TRACKED_GLOBS:
for path in base.rglob(glob):
if EXCLUDED_PATH_PARTS.intersection(path.parts):
continue
yield path
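
# Illustrative yield set (paths hypothetical): README.md,
# docs/sphinx/index.rst, docs/sphinx/guides/irb.md — but nothing under
# docs/sphinx/_build/, because ``_build`` is an excluded path part.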
def _live_tool_count() -> int:
"""Read the canonical ``ALL_TOOLS`` length without importing the package.
Avoids the import-time side effects of ``scripts.ai_assistant.agent_tools``
(langchain, ollama, etc. — heavy and not always installed). Instead,
parse the literal list from the source file.
"""
src = (REPO_ROOT / "scripts" / "ai_assistant" / "agent_tools.py").read_text()
match = re.search(r"ALL_TOOLS\s*=\s*\[(.*?)\]", src, re.DOTALL)
if not match:
raise RuntimeError("ALL_TOOLS literal not found in agent_tools.py")
body = match.group(1)
    # Count non-empty entry lines, ignoring whole-line and trailing inline
    # comments (ALL_TOOLS lists one bare identifier per line).
    entries = [
        stripped
        for line in body.splitlines()
        if (stripped := line.split("#", 1)[0].strip().rstrip(","))
    ]
    return len(entries)
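
# Illustrative shape of the literal the parser above expects (entry names
# invented):
#
#     ALL_TOOLS = [
#         query_enrollment_counts,
#         summarise_cohort,  # one bare identifier per line
#     ]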
def _live_version() -> str:
"""Read ``__version__`` literal from the canonical source file."""
src = (REPO_ROOT / "__version__.py").read_text()
match = re.search(r'__version__\s*:\s*str\s*=\s*"([^"]+)"', src)
if not match:
raise RuntimeError("__version__ literal not found in __version__.py")
return match.group(1)
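
# The canonical line this regex targets looks like (version illustrative):
#
#     __version__: str = "0.9.3"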
def _live_action_class_count() -> int:
"""Count distinct action classes in ``scripts/security/phi_scrub.yaml``.
The catalog ships eight: keep / birthdate / drop / cap / generalize /
suppress_small_cell / date_jitter / id_pseudonymize. Each appears as
a top-level YAML key (``<name>_fields:`` or ``<name>_field:``).
"""
yaml_path = REPO_ROOT / "scripts" / "security" / "phi_scrub.yaml"
if not yaml_path.is_file():
return 8 # fall back to documented constant
expected = {
"keep_fields",
"birthdate_field",
"drop_fields",
"cap_fields",
"generalize_fields",
"suppress_small_cell_fields",
"date_fields",
"id_fields",
}
seen: set[str] = set()
    for line in yaml_path.read_text().splitlines():
        if not line or line[0].isspace():
            continue  # indented lines are field lists, not action classes
        head = line.split(":", 1)[0].strip()
        if head in expected:
            seen.add(head)
return len(seen) or 8
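
# Illustrative phi_scrub.yaml shape this counter scans (field names
# invented):
#
#     drop_fields:
#       - ssn
#       - phone_number
#     birthdate_field: dob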
# ---------------------------------------------------------------------------
# Forbidden phrases — each entry is (regex, reason, allowlist). Matches are
# reported unless the surrounding 5-line window (see :func:`_check_file`)
# ALSO matches one of the allowlist patterns paired with the entry. Patterns
# run against lowercased lines, so they are effectively case-insensitive.
# ---------------------------------------------------------------------------
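
# Anatomy of an entry (illustrative only, not part of the list below):
#
#     (
#         r"\bfoo\s+index\b",                     # regex, run on lowercased text
#         "foo residue — canonical count is {tool_count}",  # reason template
#         (r"no\s+foo\s+index",),                 # allowlist for disclaimers
#     ),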
FORBIDDEN: tuple[tuple[str, str, tuple[str, ...]], ...] = (
    # Stale tool counts with a qualifier, e.g. "12 structured-data tools" or
    # "12 callable tools". Plural is the normal doc form, so the trailing
    # noun MUST allow ``s?``. Bare "12 tools" / "12 callables" and "fixed
    # list of 12 callables" are handled by the next two entries.
(
r"\b(\d+)\s+(?:structured(?:[-\s]?data)?|callable|@tool)s?\s+(?:tools?|callables?)\b",
"stale tool count — current ALL_TOOLS length is {tool_count}",
(),
),
(
r"\bfixed\s+list\s+of\s+(\d+)\s+callables?\b",
"stale tool count — current ALL_TOOLS length is {tool_count}",
(),
),
# Bare "N tools" / "N callables" without a leading qualifier. Catches
# phrasings like "12 tools" alone in prose.
(
r"\b(\d+)\s+(?:tools?|callables?)\b(?!\s+(?:run|exec|invocation))",
"stale tool count — current ALL_TOOLS length is {tool_count}",
        (
            # Intentionally empty — a cited count equal to the live
            # ALL_TOOLS length is skipped by the count-equality
            # short-circuit in :func:`_check_file`, not by an allowlist.
        ),
),
# Vector-DB / RAG / chunking as architectural CLAIMS (not disclaimers)
(
r"vector\s+(db|store|index|database)",
"vector DB residue — pipeline does not build a vector index",
(
r"no\s+(chunking|embedding|vector)",
r"without\s+(chunking|embedding|vector)",
r"vector\s+db\s+integration\s*\(if", # future-work note
r"how\s+the\s+agent\s+reads\s+the\s+bundle.*no\s+vector",
),
),
(
r"\bembedding\s+(index|store|model|search)",
"embeddings residue — pipeline has no embedding step",
(r"no\s+embedding", r"without\s+embedding"),
),
(
r"\bsemantic\s+(search|retrieval)",
"semantic search residue — agent uses {tool_count} structured tools, no semantic retrieval",
(r"no\s+semantic\s+(search|retrieval)",),
),
# "Only zone" / "agent never exposed" residue
(
r"the\s+only\s+zone\s+the\s+(llm\s+)?agent",
'"only zone" residue — current LLM read zone is trio_bundle + agent',
(),
),
(
r"only\s+zone\s+the\s+downstream",
'"only zone" residue — current LLM read zone is trio_bundle + agent',
(),
),
    # Bare "35-criterion" without the "(31 original + 4 added)" qualifier
(
r"35-?criterion",
"35-criterion needs the qualifier '(31 original + 4 added via patches 2026-04-23a/b)'",
(
r"31\s*original",
r"31\s*\+\s*4",
r"four\s+follow-?ups",
r"plus\s+four\s+follow-?ups",
r"4\s+added",
r"35\s*/\s*35\s*criteria\s+architecturally",
),
),
# Stale Makefile target names
(
r"\bmake\s+extract-pdfs\b",
"stale Make target — use `make pdf-extract`",
(),
),
# Pre-scrubbed wording (operators don't pre-scrub; pipeline does at Step 1.6)
(
r"datasets?\s+must\s+be\s+pre-?scrubbed",
'"pre-scrubbed" residue — pipeline scrubs at Step 1.6 on AMBER staging',
(),
),
(
r"reads?\s+pre-?scrubbed\s+study",
'"pre-scrubbed" residue — Step 1.6 in-pipeline scrub is canonical',
(),
),
(
r"\.xls(?!x)",
"legacy .xls residue — supported tabular inputs are .xlsx and .csv only",
(),
),
(
r"xlsx,\s*xls,\s*csv",
"legacy .xls residue — supported tabular inputs are .xlsx and .csv only",
(),
),
(
r"\bvllm\b",
"stale provider claim — supported provider IDs are openai, anthropic, google-genai, ollama, nvidia-ai-endpoints",
(),
),
(
r"test_dataset_extraction\.py",
"stale test filename — dataset extraction coverage lives in tests/test_dataset_pipeline.py",
(),
),
(
r"test_date_transform\.py",
"stale test filename — SANT/date coverage lives in tests/test_phi_scrub.py",
(),
),
(
r"ai_assistant/\s+#.*planned",
"stale test-tree claim — AI Assistant tests are active top-level tests, not a planned tests/ai_assistant folder",
(),
),
(
r"extraction/\s+#.*planned",
"stale test-tree claim — extraction tests are active top-level tests, not a planned tests/extraction folder",
(),
),
(
r"\b(?:80|90|100)%\s+(?:code\s+)?coverage",
"coverage threshold claim is not enforced by current CI; document runnable gates instead",
(),
),
(
r"\bclick\s+here\b",
"vague link text — name the destination or action",
(),
),
(
r"\bread\s+this\s+(article|document|page)\b",
"vague link text — name the destination or action",
(),
),
# Stale streamlit version pin
(
r"streamlit\s+1\.5\d",
"stale Streamlit pin — pyproject.toml requires >=1.38, <2.0",
(),
),
# Stale Llama default
(
r"\bllama-?[23]\b\s+(default|model|provider)",
"stale default model — qwen3 replaced llama since commit df52ec4",
(),
),
# "PDF-snippet sanitiser" gap claims (closed in patch-2026-04-23a)
(
r"no\s+pdf-?snippet\s+(instruction\s+)?sanitiser\s+(today|currently)",
"patch-2026-04-23a closed this gap (sanitise_untrusted_snippet)",
(),
),
(
r"sanitiser\s+is\s+the\s+planned\s+hardening",
"patch-2026-04-23a already shipped this hardening",
(),
),
# Old ONLY-zone residue in IRB diagrams (handled separately above; keep
# an additional pattern for the ASCII variant)
(
r"<--\s*the\s+only\s+zone",
'"the ONLY zone" residue — current LLM read zone is trio_bundle + agent',
(),
),
# ``__version__`` references must agree with the canonical literal in
# ``__version__.py``. Match patterns like ``__version__ = "0.x.y"``,
# ``"version": "0.x.y"``, or "current version 0.x.y" in prose. The
# version-equality short-circuit in :func:`_check_file` skips matches
# whose digits equal the live version.
(
# Match both ``__version__ = "X.Y.Z"`` and the annotated
# ``__version__: str = "X.Y.Z"`` forms used in docs/code samples.
r"__version__[^\"\n]*\"\d+\.\d+\.\d+\"",
"stale __version__ literal — canonical is {version}",
(),
),
(
r"current\s+version\s*:?\s*\d+\.\d+\.\d+",
"stale 'current version' claim — canonical is {version}",
(),
),
# Action-class catalog count drift ("8-action catalog", "7-action
# classes", etc.). The action-count short-circuit in
# :func:`_check_file` skips correct-number matches.
(
r"\b\d+[-\s]+action\s+(?:catalog|catalogue|classes?|set)\b",
"stale action-class count — canonical is {action_count} classes (see scripts/security/phi_scrub.yaml)",
(),
),
)
def _check_file(
path: Path,
*,
tool_count: int,
version: str,
action_count: int,
) -> list[Finding]:
"""Return every drift Finding produced by ``path``.
Disclaimer allowlists are evaluated on a 5-line window (the matching
line plus the two before and two after) so wrapped prose like::
... directly. No chunking, embedding, or
vector index is needed.
is recognised as a disclaimer even when "vector" lands on a separate
line from "no".
"""
findings: list[Finding] = []
try:
text = path.read_text()
except (OSError, UnicodeDecodeError):
return findings
lines = text.splitlines()
lower_lines = [line.lower() for line in lines]
for index, raw in enumerate(lines):
line_no = index + 1
lower = lower_lines[index]
# 5-line window: 2 before, current, 2 after.
window_lo = max(0, index - 2)
window_hi = min(len(lower_lines), index + 3)
window_text = " ".join(lower_lines[window_lo:window_hi])
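        # e.g. index 10 → lower_lines[8:13], i.e. doc lines 9-13 (1-indexed):
        # the candidate line plus up to two neighbours on each side.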
for pattern, reason_tmpl, allowlist in FORBIDDEN:
if not re.search(pattern, lower):
continue
if any(re.search(allow, window_text) for allow in allowlist):
continue
            # Special-case the tool-count regexes: the patterns match both
            # correct (e.g., "12 tools") and stale (e.g., "10 tools") cases.
            # Skip when the captured number equals the live count. Guard on
            # the entry's reason so an unrelated FORBIDDEN match on a line
            # that merely mentions the live count is not silently dropped.
            if "stale tool count" in reason_tmpl:
                count_match = re.search(
                    r"\b(\d+)\s+(?:structured(?:[-\s]?data)?\s+)?(?:tools?|callables?|@?tool)",
                    lower,
                )
                if count_match and int(count_match.group(1)) == tool_count:
                    continue
            # Action-class count drift ("8-action catalog", etc.) — only the
            # digit form is auto-checked here; the spelled-out form ("eight
            # action classes") is left alone because it doesn't drift in
            # practice.
            if "stale action-class count" in reason_tmpl:
                action_match = re.search(
                    r"\b(\d+)[-\s]+action\s+(?:catalog|catalogue|classes?|set)",
                    lower,
                )
                if action_match and int(action_match.group(1)) == action_count:
                    continue
            # Version drift — same rule: only run the equality short-circuit
            # when the matched FORBIDDEN entry is itself version-related
            # (otherwise an unrelated FORBIDDEN match on a line that happens
            # to mention the live version would be silently dropped).
            if "{version}" in reason_tmpl:
                version_match = re.search(r"\b(\d+\.\d+\.\d+)\b", lower)
                if version_match and version_match.group(1) == version:
                    continue
reason = reason_tmpl.format(
tool_count=tool_count,
version=version,
action_count=action_count,
)
findings.append(Finding(path=path, line_no=line_no, line=raw, reason=reason))
return findings
def main() -> int:
"""Run every check and return a process exit code (0 = clean, 1 = drift)."""
tool_count = _live_tool_count()
version = _live_version()
action_count = _live_action_class_count()
print(
f"[doc-freshness] live values: ALL_TOOLS={tool_count}, "
f"__version__={version}, action_classes={action_count}",
file=sys.stderr,
)
findings: list[Finding] = []
for path in sorted(_iter_tracked_files()):
findings.extend(
_check_file(
path,
tool_count=tool_count,
version=version,
action_count=action_count,
)
)
if not findings:
print("[doc-freshness] OK — no stale-doc drift detected.", file=sys.stderr)
return 0
print(f"[doc-freshness] FAIL — {len(findings)} drift(s):", file=sys.stderr)
for finding in findings:
print(finding.render(), file=sys.stderr)
print(
"\nRefresh the offending lines, then re-run "
"`uv run --frozen python scripts/lint_doc_freshness.py`.",
file=sys.stderr,
)
return 1
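
# Typical CI wiring (illustrative — the step name and workflow syntax are
# assumptions, not part of this repo's config; the command itself matches
# the rerun hint printed above):
#
#     - name: docs-quality-check
#       run: uv run --frozen python scripts/lint_doc_freshness.py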
if __name__ == "__main__":
raise SystemExit(main())