Source code for scripts.extraction.io.clinical_dates

"""Variable-aware date parsing for the Indo-VAP clinical dataset.

The Indo-VAP Excel sheets store dates in **three** distinct ways:

1. **Excel datetime cells** — openpyxl / pandas parse these into Python
   ``datetime`` objects automatically.  No ambiguity.
2. **Slash-delimited text strings** — stored as plain text in the cell.
   The date *order* (month-first vs day-first) **varies per variable**:

   * **Most variables** use US-style **M/D/YYYY** or **M/D/YY**
     (e.g. ``"08/12/2014 12:27:48 PM"``, ``"7/28/14"``).
   * **Six specific variables** use Indian-style **D/M/YYYY** or **D/M/YY**
     (e.g. ``IC_VISDAT="28/05/2014"``, ``IT_IGRADAT="12/12/12"``).

   The canonical set of day-first variables is maintained in
   :data:`DMY_VARIABLES` below.

3. **ISO datetime strings** — ``"2014-07-28"`` or ``"2014-07-28 00:00:00"``.
   Unambiguous; year-month-day order.

This module provides:

* :func:`parse_date` — parse any of the above into a ``datetime``.
* :func:`value_looks_like_date` — quick check for date-like strings.
* :func:`is_dmy_variable` — check if a variable uses D/M order.

All functions are pure (no side effects) and safe to call from any module.

Generated by scanning all 44 raw Excel files (2026-03-25).
"""

from __future__ import annotations

import re
from datetime import datetime
from typing import NamedTuple

__all__ = [
    "DMY_VARIABLES",
    "ParsedDate",
    "is_dmy_variable",
    "parse_date",
    "value_looks_like_date",
]

# ============================================================================
# Per-variable date order — D/M (day-first) variables
# ============================================================================
# The Indo-VAP Excel sheets are inconsistent: most text-string dates use
# US-style M/D/YYYY (month first), but a handful of variables use Indian-style
# D/M/YYYY (day first).  We determined this by scanning ALL rows in ALL 44
# Excel files: if any cell in a column has group1 > 12, that column MUST be
# day-first.  The 6 confirmed D/M variables are listed below.
#
# All other text-string date variables use M/D order (US-style) or are
# ambiguous (both parts ≤ 12) — those default to M/D.
#
# Excel-datetime columns (openpyxl parses to Python datetime) have no
# ambiguity — they are already typed objects.
#
# Verified by scanning all 44 raw Excel files (2026-03-25).

DMY_VARIABLES: frozenset[str] = frozenset(
    {
        "CBC_HBADAT",  # 5_CBC.xlsx            e.g. "25/09/2017"
        "CC_VISDAT",  # 14_Case_Control.xlsx   e.g. "25/08/2015"
        "FOA_VISDAT",  # 98A_FOA.xlsx           e.g. "19/02/2015"
        "FOB_VISDAT",  # 98B_FOB.xlsx           e.g. "24/07/2015"
        "IC_VISDAT",  # 2A_ICBaseline.xlsx     e.g. "28/05/2014"
        "IT_IGRADAT",  # 11_IGRA.xlsx           e.g. "12/12/12" (D/M/YY, 2-digit year)
    }
)
"""Variable names whose slash-date text strings use D/M/YYYY (day first, Indian).

All other slash-date variables default to M/D/YYYY (month first, US).
"""


# ============================================================================
# Compiled regexes
# ============================================================================

# ISO: YYYY-MM-DD [HH:MM:SS]
_ISO_RE = re.compile(
    r"^((?:19|20)\d{2})-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])"
    r"(?:\s+(\d{2}):(\d{2}):(\d{2}))?$"
)

# Slash-delimited: A/B/C [H:M:S [AM|PM]]
# Groups: (1)=first, (2)=second, (3)=year, (4)=hour, (5)=min, (6)=sec, (7)=AM/PM
_SLASH_RE = re.compile(
    r"^(\d{1,2})/(\d{1,2})/(\d{2,4})"
    r"(?:\s+(\d{1,2}):(\d{2}):(\d{2})(?:\s*([AP]M))?)?$",
    re.I,
)

# Quick detection: matches any slash-date (M/D or D/M) with optional time
_SLASH_DETECT_RE = re.compile(
    r"^\d{1,2}/\d{1,2}/\d{2,4}(?:\s+\d{1,2}:\d{2}:\d{2}(?:\s*[AP]M)?)?$",
    re.I,
)

# Quick detection: ISO date
_ISO_DETECT_RE = re.compile(
    r"^(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])"
    r"(?:\s+\d{2}:\d{2}:\d{2})?$"
)


# ============================================================================
# Data structures
# ============================================================================


[docs] class ParsedDate(NamedTuple): """Result of parsing a date string.""" dt: datetime """The parsed datetime. Time component is always ``00:00:00``; the original string's time is intentionally discarded for date-granularity anonymization (see ``has_time`` and ``ampm`` for source-layout metadata).""" has_time: bool """True if the original string included a time component.""" ampm: str | None """'AM' or 'PM' if present in the original, else None.""" format: str """One of 'iso', 'mdy', 'dmy' indicating the detected source format.""" original: str """The original (stripped) string that was parsed."""
# ============================================================================ # Variable-name lookup # ============================================================================
[docs] def is_dmy_variable(field_name: str) -> bool: """Return True if *field_name* is known to use D/M (day-first) date order. Checks against the canonical ``DMY_VARIABLES`` frozenset defined in this module. The comparison is exact (case-sensitive) because the variable names come from Excel headers. """ return field_name in DMY_VARIABLES
# ============================================================================ # Core parser # ============================================================================ def _expand_year(y: int) -> int: """Expand a 2-digit year: <50 → 2000+, ≥50 → 1900+. 4-digit unchanged.""" if y < 100: return 2000 + y if y < 50 else 1900 + y return y
[docs] def parse_date(value: str, *, field_name: str | None = None) -> ParsedDate | None: """Parse a date/datetime text string into a :class:`ParsedDate`. Args: value: The raw text string (e.g. ``"7/28/14"``, ``"28/05/2014"``, ``"2014-07-28 00:00:00"``). field_name: The column/variable name the value came from. Used to determine M/D vs D/M order for slash-delimited dates. If ``None``, defaults to M/D (US-style). Returns: A :class:`ParsedDate` on success, or ``None`` if the string cannot be parsed as a date. Examples:: >>> parse_date("7/28/14") ParsedDate(dt=datetime(2014, 7, 28), ..., format='mdy', ...) >>> parse_date("28/05/2014", field_name="IC_VISDAT") ParsedDate(dt=datetime(2014, 5, 28), ..., format='dmy', ...) >>> parse_date("2014-07-28 00:00:00") ParsedDate(dt=datetime(2014, 7, 28), ..., format='iso', ...) """ value = value.strip() if not value: return None # ── Try ISO: YYYY-MM-DD [HH:MM:SS] ── m = _ISO_RE.match(value) if m: y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3)) try: dt = datetime(y, mo, d) except (ValueError, OverflowError): return None return ParsedDate( dt=dt, has_time=m.group(4) is not None, ampm=None, format="iso", original=value, ) # ── Try slash-delimited: A/B/C [H:M:S [AM/PM]] ── m = _SLASH_RE.match(value) if m: g1, g2, g3 = int(m.group(1)), int(m.group(2)), int(m.group(3)) y = _expand_year(g3) # Decide month/day order based on field name dmy = field_name is not None and is_dmy_variable(field_name) if dmy: # D/M/Y: group1=day, group2=month day, month = g1, g2 fmt = "dmy" else: # M/D/Y: group1=month, group2=day (default) month, day = g1, g2 fmt = "mdy" try: dt = datetime(y, month, day) except (ValueError, OverflowError): return None ampm = m.group(7).upper() if m.group(7) else None return ParsedDate( dt=dt, has_time=m.group(4) is not None, ampm=ampm, format=fmt, original=value, ) return None
# ============================================================================ # Date detection # ============================================================================
[docs] def value_looks_like_date(value: str) -> bool: """Return True if *value* looks like a date/datetime string. This is a quick check — it does NOT validate the date components. """ v = value.strip() return bool(_ISO_DETECT_RE.match(v) or _SLASH_DETECT_RE.match(v))