Source code for scripts.extraction.io.clinical_dates

"""Variable-aware date parsing for the Indo-VAP clinical dataset.

The Indo-VAP Excel sheets store dates in **three** distinct ways:

1. **Excel datetime cells** — openpyxl / pandas parse these into Python
   ``datetime`` objects automatically.  No ambiguity.
2. **Slash-delimited text strings** — stored as plain text in the cell.
   The date *order* (month-first vs day-first) **varies per variable**:

   * **Most variables** use US-style **M/D/YYYY** or **M/D/YY**
     (e.g. ``"08/12/2014 12:27:48 PM"``, ``"7/28/14"``).
   * **Six specific variables** use Indian-style **D/M/YYYY** or **D/M/YY**
     (e.g. ``IC_VISDAT="28/05/2014"``, ``IT_IGRADAT="12/12/12"``).

   The canonical set of day-first variables is maintained in
   :data:`DMY_VARIABLES` below.

3. **ISO datetime strings** — ``"2014-07-28"`` or ``"2014-07-28 00:00:00"``.
   Unambiguous; year-month-day order.

This module provides:

* :func:`parse_date` — parse any of the above into a ``datetime``.
* :func:`value_looks_like_date` — quick check for date-like strings.
* :func:`is_dmy_variable` — check if a variable uses D/M order.

All functions are pure (no side effects) and safe to call from any module.

Generated by scanning all 44 raw Excel files (2026-03-25).
"""

from __future__ import annotations

import re
from datetime import datetime
from typing import NamedTuple

__all__ = [
    "DMY_VARIABLES",
    "ParsedDate",
    "is_dmy_variable",
    "parse_date",
    "value_looks_like_date",
]

# ============================================================================
# Per-variable date order — D/M (day-first) variables
# ============================================================================
# The Indo-VAP Excel sheets are inconsistent: most text-string dates use
# US-style M/D/YYYY (month first), but a handful of variables use Indian-style
# D/M/YYYY (day first).  We determined this by scanning ALL rows in ALL 44
# Excel files: if any cell in a column has group1 > 12, that column MUST be
# day-first.  The 6 confirmed D/M variables are listed below.
#
# All other text-string date variables use M/D order (US-style) or are
# ambiguous (both parts ≤ 12) — those default to M/D.
#
# Excel-datetime columns (openpyxl parses to Python datetime) have no
# ambiguity — they are already typed objects.
#
# Verified by scanning all 44 raw Excel files (2026-03-25).

DMY_VARIABLES: frozenset[str] = frozenset(
    {
        "CBC_HBADAT",  # 5_CBC.xlsx            e.g. "25/09/2017"
        "CC_VISDAT",  # 14_Case_Control.xlsx   e.g. "25/08/2015"
        "FOA_VISDAT",  # 98A_FOA.xlsx           e.g. "19/02/2015"
        "FOB_VISDAT",  # 98B_FOB.xlsx           e.g. "24/07/2015"
        "IC_VISDAT",  # 2A_ICBaseline.xlsx     e.g. "28/05/2014"
        "IT_IGRADAT",  # 11_IGRA.xlsx           e.g. "12/12/12" (D/M/YY, 2-digit year)
    }
)
"""Variable names whose slash-date text strings use D/M/YYYY (day first, Indian).

All other slash-date variables default to M/D/YYYY (month first, US).
"""


# ============================================================================
# Compiled regexes
# ============================================================================

# ISO: YYYY-MM-DD [HH:MM:SS]
_ISO_RE = re.compile(
    r"^((?:19|20)\d{2})-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])"
    r"(?:\s+(\d{2}):(\d{2}):(\d{2}))?$"
)

# Slash-delimited: A/B/C [H:M:S [AM|PM]]
# Groups: (1)=first, (2)=second, (3)=year, (4)=hour, (5)=min, (6)=sec, (7)=AM/PM
_SLASH_RE = re.compile(
    r"^(\d{1,2})/(\d{1,2})/(\d{2,4})"
    r"(?:\s+(\d{1,2}):(\d{2}):(\d{2})(?:\s*([AP]M))?)?$",
    re.I,
)

# Quick detection: matches any slash-date (M/D or D/M) with optional time
_SLASH_DETECT_RE = re.compile(
    r"^\d{1,2}/\d{1,2}/\d{2,4}(?:\s+\d{1,2}:\d{2}:\d{2}(?:\s*[AP]M)?)?$",
    re.I,
)

# Quick detection: ISO date
_ISO_DETECT_RE = re.compile(
    r"^(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])"
    r"(?:\s+\d{2}:\d{2}:\d{2})?$"
)


# ============================================================================
# Data structures
# ============================================================================



[docs]
class ParsedDate(NamedTuple):
    """Result of parsing a date string."""

    dt: datetime
    """The parsed datetime. Time component is always ``00:00:00``; the original
    string's time is intentionally discarded for date-granularity anonymization
    (see ``has_time`` and ``ampm`` for source-layout metadata)."""

    has_time: bool
    """True if the original string included a time component."""

    ampm: str | None
    """'AM' or 'PM' if present in the original, else None."""

    format: str
    """One of 'iso', 'mdy', 'dmy' indicating the detected source format."""

    original: str
    """The original (stripped) string that was parsed."""



# ============================================================================
# Variable-name lookup
# ============================================================================



[docs]
def is_dmy_variable(field_name: str) -> bool:
    """Return True if *field_name* is known to use D/M (day-first) date order.

    Checks against the canonical ``DMY_VARIABLES`` frozenset defined in
    this module.  The comparison is exact (case-sensitive) because the
    variable names come from Excel headers.
    """
    return field_name in DMY_VARIABLES



# ============================================================================
# Core parser
# ============================================================================


def _expand_year(y: int) -> int:
    """Expand a 2-digit year: <50 → 2000+, ≥50 → 1900+.  4-digit unchanged."""
    if y < 100:
        return 2000 + y if y < 50 else 1900 + y
    return y



[docs]
def parse_date(value: str, *, field_name: str | None = None) -> ParsedDate | None:
    """Parse a date/datetime text string into a :class:`ParsedDate`.

    Args:
        value: The raw text string (e.g. ``"7/28/14"``, ``"28/05/2014"``,
               ``"2014-07-28 00:00:00"``).
        field_name: The column/variable name the value came from.  Used to
                    determine M/D vs D/M order for slash-delimited dates.
                    If ``None``, defaults to M/D (US-style).

    Returns:
        A :class:`ParsedDate` on success, or ``None`` if the string cannot
        be parsed as a date.

    Examples::

        >>> parse_date("7/28/14")
        ParsedDate(dt=datetime(2014, 7, 28), ..., format='mdy', ...)

        >>> parse_date("28/05/2014", field_name="IC_VISDAT")
        ParsedDate(dt=datetime(2014, 5, 28), ..., format='dmy', ...)

        >>> parse_date("2014-07-28 00:00:00")
        ParsedDate(dt=datetime(2014, 7, 28), ..., format='iso', ...)
    """
    value = value.strip()
    if not value:
        return None

    # ── Try ISO: YYYY-MM-DD [HH:MM:SS] ──
    m = _ISO_RE.match(value)
    if m:
        y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
        try:
            dt = datetime(y, mo, d)
        except (ValueError, OverflowError):
            return None
        return ParsedDate(
            dt=dt,
            has_time=m.group(4) is not None,
            ampm=None,
            format="iso",
            original=value,
        )

    # ── Try slash-delimited: A/B/C [H:M:S [AM/PM]] ──
    m = _SLASH_RE.match(value)
    if m:
        g1, g2, g3 = int(m.group(1)), int(m.group(2)), int(m.group(3))
        y = _expand_year(g3)

        # Decide month/day order based on field name
        dmy = field_name is not None and is_dmy_variable(field_name)
        if dmy:
            # D/M/Y: group1=day, group2=month
            day, month = g1, g2
            fmt = "dmy"
        else:
            # M/D/Y: group1=month, group2=day (default)
            month, day = g1, g2
            fmt = "mdy"

        try:
            dt = datetime(y, month, day)
        except (ValueError, OverflowError):
            return None

        ampm = m.group(7).upper() if m.group(7) else None
        return ParsedDate(
            dt=dt,
            has_time=m.group(4) is not None,
            ampm=ampm,
            format=fmt,
            original=value,
        )

    return None



# ============================================================================
# Date detection
# ============================================================================



[docs]
def value_looks_like_date(value: str) -> bool:
    """Return True if *value* looks like a date/datetime string.

    This is a quick check — it does NOT validate the date components.
    """
    v = value.strip()
    return bool(_ISO_DETECT_RE.match(v) or _SLASH_DETECT_RE.match(v))