"""Variable-aware date parsing for the Indo-VAP clinical dataset.
The Indo-VAP Excel sheets store dates in **three** distinct ways:
1. **Excel datetime cells** — openpyxl / pandas parse these into Python
``datetime`` objects automatically. No ambiguity.
2. **Slash-delimited text strings** — stored as plain text in the cell.
The date *order* (month-first vs day-first) **varies per variable**:
* **Most variables** use US-style **M/D/YYYY** or **M/D/YY**
(e.g. ``"08/12/2014 12:27:48 PM"``, ``"7/28/14"``).
* **Six specific variables** use Indian-style **D/M/YYYY** or **D/M/YY**
(e.g. ``IC_VISDAT="28/05/2014"``, ``IT_IGRADAT="12/12/12"``).
The canonical set of day-first variables is maintained in
:data:`DMY_VARIABLES` below.
3. **ISO datetime strings** — ``"2014-07-28"`` or ``"2014-07-28 00:00:00"``.
Unambiguous; year-month-day order.
This module provides:
* :func:`parse_date` — parse any of the above into a ``datetime``.
* :func:`value_looks_like_date` — quick check for date-like strings.
* :func:`is_dmy_variable` — check if a variable uses D/M order.
All functions are pure (no side effects) and safe to call from any module.
Generated by scanning all 44 raw Excel files (2026-03-25).
"""
from __future__ import annotations
import re
from datetime import datetime
from typing import NamedTuple
__all__ = [
"DMY_VARIABLES",
"ParsedDate",
"is_dmy_variable",
"parse_date",
"value_looks_like_date",
]
# ============================================================================
# Per-variable date order — D/M (day-first) variables
# ============================================================================
# The Indo-VAP Excel sheets are inconsistent: most text-string dates use
# US-style M/D/YYYY (month first), but a handful of variables use Indian-style
# D/M/YYYY (day first). We determined this by scanning ALL rows in ALL 44
# Excel files: if any cell in a column has group1 > 12, that column MUST be
# day-first. The 6 confirmed D/M variables are listed below.
#
# All other text-string date variables use M/D order (US-style) or are
# ambiguous (both parts ≤ 12) — those default to M/D.
#
# Excel-datetime columns (openpyxl parses to Python datetime) have no
# ambiguity — they are already typed objects.
#
# Verified by scanning all 44 raw Excel files (2026-03-25).
DMY_VARIABLES: frozenset[str] = frozenset(
{
"CBC_HBADAT", # 5_CBC.xlsx e.g. "25/09/2017"
"CC_VISDAT", # 14_Case_Control.xlsx e.g. "25/08/2015"
"FOA_VISDAT", # 98A_FOA.xlsx e.g. "19/02/2015"
"FOB_VISDAT", # 98B_FOB.xlsx e.g. "24/07/2015"
"IC_VISDAT", # 2A_ICBaseline.xlsx e.g. "28/05/2014"
"IT_IGRADAT", # 11_IGRA.xlsx e.g. "12/12/12" (D/M/YY, 2-digit year)
}
)
"""Variable names whose slash-date text strings use D/M/YYYY (day first, Indian).
All other slash-date variables default to M/D/YYYY (month first, US).
"""
# ============================================================================
# Compiled regexes
# ============================================================================
# ISO: YYYY-MM-DD [HH:MM:SS]
_ISO_RE = re.compile(
r"^((?:19|20)\d{2})-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])"
r"(?:\s+(\d{2}):(\d{2}):(\d{2}))?$"
)
# Slash-delimited: A/B/C [H:M:S [AM|PM]]
# Groups: (1)=first, (2)=second, (3)=year, (4)=hour, (5)=min, (6)=sec, (7)=AM/PM
_SLASH_RE = re.compile(
r"^(\d{1,2})/(\d{1,2})/(\d{2,4})"
r"(?:\s+(\d{1,2}):(\d{2}):(\d{2})(?:\s*([AP]M))?)?$",
re.I,
)
# Quick detection: matches any slash-date (M/D or D/M) with optional time
_SLASH_DETECT_RE = re.compile(
r"^\d{1,2}/\d{1,2}/\d{2,4}(?:\s+\d{1,2}:\d{2}:\d{2}(?:\s*[AP]M)?)?$",
re.I,
)
# Quick detection: ISO date
_ISO_DETECT_RE = re.compile(
r"^(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])"
r"(?:\s+\d{2}:\d{2}:\d{2})?$"
)
# ============================================================================
# Data structures
# ============================================================================
[docs]
class ParsedDate(NamedTuple):
"""Result of parsing a date string."""
dt: datetime
"""The parsed datetime. Time component is always ``00:00:00``; the original
string's time is intentionally discarded for date-granularity anonymization
(see ``has_time`` and ``ampm`` for source-layout metadata)."""
has_time: bool
"""True if the original string included a time component."""
ampm: str | None
"""'AM' or 'PM' if present in the original, else None."""
format: str
"""One of 'iso', 'mdy', 'dmy' indicating the detected source format."""
original: str
"""The original (stripped) string that was parsed."""
# ============================================================================
# Variable-name lookup
# ============================================================================
[docs]
def is_dmy_variable(field_name: str) -> bool:
"""Return True if *field_name* is known to use D/M (day-first) date order.
Checks against the canonical ``DMY_VARIABLES`` frozenset defined in
this module. The comparison is exact (case-sensitive) because the
variable names come from Excel headers.
"""
return field_name in DMY_VARIABLES
# ============================================================================
# Core parser
# ============================================================================
def _expand_year(y: int) -> int:
"""Expand a 2-digit year: <50 → 2000+, ≥50 → 1900+. 4-digit unchanged."""
if y < 100:
return 2000 + y if y < 50 else 1900 + y
return y
[docs]
def parse_date(value: str, *, field_name: str | None = None) -> ParsedDate | None:
"""Parse a date/datetime text string into a :class:`ParsedDate`.
Args:
value: The raw text string (e.g. ``"7/28/14"``, ``"28/05/2014"``,
``"2014-07-28 00:00:00"``).
field_name: The column/variable name the value came from. Used to
determine M/D vs D/M order for slash-delimited dates.
If ``None``, defaults to M/D (US-style).
Returns:
A :class:`ParsedDate` on success, or ``None`` if the string cannot
be parsed as a date.
Examples::
>>> parse_date("7/28/14")
ParsedDate(dt=datetime(2014, 7, 28), ..., format='mdy', ...)
>>> parse_date("28/05/2014", field_name="IC_VISDAT")
ParsedDate(dt=datetime(2014, 5, 28), ..., format='dmy', ...)
>>> parse_date("2014-07-28 00:00:00")
ParsedDate(dt=datetime(2014, 7, 28), ..., format='iso', ...)
"""
value = value.strip()
if not value:
return None
# ── Try ISO: YYYY-MM-DD [HH:MM:SS] ──
m = _ISO_RE.match(value)
if m:
y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
try:
dt = datetime(y, mo, d)
except (ValueError, OverflowError):
return None
return ParsedDate(
dt=dt,
has_time=m.group(4) is not None,
ampm=None,
format="iso",
original=value,
)
# ── Try slash-delimited: A/B/C [H:M:S [AM/PM]] ──
m = _SLASH_RE.match(value)
if m:
g1, g2, g3 = int(m.group(1)), int(m.group(2)), int(m.group(3))
y = _expand_year(g3)
# Decide month/day order based on field name
dmy = field_name is not None and is_dmy_variable(field_name)
if dmy:
# D/M/Y: group1=day, group2=month
day, month = g1, g2
fmt = "dmy"
else:
# M/D/Y: group1=month, group2=day (default)
month, day = g1, g2
fmt = "mdy"
try:
dt = datetime(y, month, day)
except (ValueError, OverflowError):
return None
ampm = m.group(7).upper() if m.group(7) else None
return ParsedDate(
dt=dt,
has_time=m.group(4) is not None,
ampm=ampm,
format=fmt,
original=value,
)
return None
# ============================================================================
# Date detection
# ============================================================================
[docs]
def value_looks_like_date(value: str) -> bool:
"""Return True if *value* looks like a date/datetime string.
This is a quick check — it does NOT validate the date components.
"""
v = value.strip()
return bool(_ISO_DETECT_RE.match(v) or _SLASH_DETECT_RE.match(v))