Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ Available Datasets
datasets/pyhealth.datasets.SHHSDataset
datasets/pyhealth.datasets.SleepEDFDataset
datasets/pyhealth.datasets.EHRShotDataset
datasets/pyhealth.datasets.HiRIDDataset
datasets/pyhealth.datasets.Support2Dataset
datasets/pyhealth.datasets.BMDHSDataset
datasets/pyhealth.datasets.COVID19CXRDataset
Expand Down
9 changes: 9 additions & 0 deletions docs/api/datasets/pyhealth.datasets.HiRIDDataset.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
pyhealth.datasets.HiRIDDataset
===================================

The HiRID (High time Resolution ICU Dataset) contains ~34,000 ICU admissions from Bern University Hospital with high-resolution time-series data. Refer to `PhysioNet <https://physionet.org/content/hirid/1.1.1/>`_ for more information.

.. autoclass:: pyhealth.datasets.HiRIDDataset
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/api/tasks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ Available Tasks
Temple University EEG Tasks <tasks/pyhealth.tasks.temple_university_EEG_tasks>
Sleep Staging v2 <tasks/pyhealth.tasks.sleep_staging_v2>
Benchmark EHRShot <tasks/pyhealth.tasks.benchmark_ehrshot>
FAMEWS Fairness Audit <tasks/pyhealth.tasks.FAMEWS_fairness_audit>
ChestX-ray14 Binary Classification <tasks/pyhealth.tasks.ChestXray14BinaryClassification>
De-Identification NER <tasks/pyhealth.tasks.DeIDNERTask>
ChestX-ray14 Multilabel Classification <tasks/pyhealth.tasks.ChestXray14MultilabelClassification>
Expand Down
7 changes: 7 additions & 0 deletions docs/api/tasks/pyhealth.tasks.FAMEWS_fairness_audit.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
pyhealth.tasks.FAMEWS_fairness_audit
=====================================

.. autoclass:: pyhealth.tasks.FAMEWS_fairness_audit.FAMEWSFairnessAudit
:members:
:undoc-members:
:show-inheritance:
178 changes: 178 additions & 0 deletions examples/HiRID_fairness_audit.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "edaf0cae",
"metadata": {},
"source": [
"# FAMEWS Fairness Audit (Quick Start)\n",
"\n",
"This notebook shows a minimal end-to-end example for:\n",
"\n",
"1. Loading `HiRIDDataset`\n",
"2. Running `FAMEWSFairnessAudit` on a patient\n",
"3. Building a `SampleDataset` with `set_task(...)`\n",
"4. Printing a quick subgroup summary (sex and age group)\n",
"\n",
"Use `dev=True` for a fast smoke test, then switch to `dev=False` for full runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e33e0fa7",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"from pathlib import Path\n",
"import sys\n",
"\n",
"import pandas as pd\n",
"\n",
"def find_repo_root(start: Path) -> Path:\n",
" for path in [start, *start.parents]:\n",
" if (path / \"pyproject.toml\").exists() and (path / \"pyhealth\").exists():\n",
" return path\n",
" raise FileNotFoundError(\"Could not locate the PyHealth repository root from the current working directory.\")\n",
"\n",
"REPO_ROOT = find_repo_root(Path.cwd().resolve())\n",
"for name in list(sys.modules):\n",
" if name == \"pyhealth\" or name.startswith(\"pyhealth.\"):\n",
" del sys.modules[name]\n",
"if str(REPO_ROOT) in sys.path:\n",
" sys.path.remove(str(REPO_ROOT))\n",
"sys.path.insert(0, str(REPO_ROOT))\n",
"\n",
"from pyhealth.datasets.hirid import HiRIDDataset\n",
"from pyhealth.tasks.FAMEWS_fairness_audit import FAMEWSFairnessAudit\n",
"\n",
"HIRID_ROOT = REPO_ROOT / \"test-resources\" / \"core\" / \"hiriddemo\"\n",
"\n",
"assert HIRID_ROOT.exists(), (\n",
" f\"Expected HiRID root at {HIRID_ROOT}, but it was not found.\"\n",
")\n",
"\n",
"print(f\"Using HiRID root: {HIRID_ROOT}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ad8a1ef",
"metadata": {},
"outputs": [],
"source": [
"# Quick setup: use imputed stage + dev mode for fast iteration.\n",
"dataset = HiRIDDataset(\n",
" root=str(HIRID_ROOT),\n",
" stage=\"imputed\",\n",
" dev=True,\n",
")\n",
"\n",
"task = FAMEWSFairnessAudit(stage_table=\"imputed_stage\")\n",
"\n",
"# Inspect one raw task sample from a single patient.\n",
"first_pid = dataset.unique_patient_ids[0]\n",
"first_patient = dataset.get_patient(first_pid)\n",
"raw_samples = task(first_patient)\n",
"\n",
"print(f\"First patient id: {first_pid}\")\n",
"print(f\"Raw samples generated for first patient: {len(raw_samples)}\")\n",
"\n",
"if raw_samples:\n",
" raw0 = raw_samples[0]\n",
" print(\"Raw sample keys:\", sorted(raw0.keys()))\n",
" print(\"Demographics:\", {\n",
" \"sex\": raw0.get(\"sex\"),\n",
" \"age\": raw0.get(\"age\"),\n",
" \"age_group\": raw0.get(\"age_group\"),\n",
" \"discharge_status\": raw0.get(\"discharge_status\"),\n",
" })\n",
"\n",
" ts, values = raw0[\"signals\"]\n",
" print(\"Timeseries length:\", len(ts))\n",
" print(\"Values shape:\", values.shape)\n",
"\n",
" display(pd.DataFrame(values[:5], columns=raw0[\"feature_columns\"]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78981b54",
"metadata": {},
"outputs": [],
"source": [
"# Build a PyHealth SampleDataset (this applies processors and caches outputs).\n",
"sample_dataset = dataset.set_task(task, num_workers=1)\n",
"\n",
"print(\"Dataset stats:\")\n",
"dataset.stats()\n",
"print(f\"\\nNumber of processed ML samples: {len(sample_dataset)}\")\n",
"\n",
"sample0 = sample_dataset[0]\n",
"print(\"SampleDataset keys:\", sorted(sample0.keys()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98afa328",
"metadata": {},
"outputs": [],
"source": [
"# Quick fairness-oriented summary over a small cohort slice.\n",
"# We intentionally use raw task outputs here to keep metadata fields explicit.\n",
"max_patients = 200\n",
"sex_counter = Counter()\n",
"age_group_counter = Counter()\n",
"valid_patients = 0\n",
"\n",
"for pid in dataset.unique_patient_ids[:max_patients]:\n",
" p = dataset.get_patient(pid)\n",
" samples = task(p)\n",
" if not samples:\n",
" continue\n",
" valid_patients += 1\n",
" s = samples[0]\n",
" sex_counter[str(s.get(\"sex\"))] += 1\n",
" age_group_counter[str(s.get(\"age_group\"))] += 1\n",
"\n",
"print(f\"Patients scanned: {max_patients}\")\n",
"print(f\"Patients with at least one task sample: {valid_patients}\")\n",
"\n",
"summary_df = pd.DataFrame({\n",
" \"sex\": dict(sex_counter),\n",
" \"age_group\": dict(age_group_counter),\n",
"})\n",
"\n",
"display(summary_df.fillna(0).astype(int))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions pyhealth/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def __init__(self, *args, **kwargs):
from .dreamt import DREAMTDataset
from .ehrshot import EHRShotDataset
from .eicu import eICUDataset
from .hirid import HiRIDDataset
from .isruc import ISRUCDataset
from .medical_transcriptions import MedicalTranscriptionsDataset
from .mimic3 import MIMIC3Dataset
Expand Down
89 changes: 89 additions & 0 deletions pyhealth/datasets/configs/hirid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# PyHealth table configuration for HiRID v1.1.1 (High time Resolution ICU
# Dataset, Bern University Hospital; see physionet.org/content/hirid/1.1.1).
# Each entry maps one exported CSV to its patient-id column, its timestamp
# column, and the attribute columns PyHealth should load.
version: "1.1.1"
tables:
# One row per ICU admission: demographics (sex, age) plus the
# discharge_status outcome, stamped with the admission time.
general_table:
file_path: "general_table.csv"
patient_id: "patientid"
timestamp: "admissiontime"
attributes:
- "sex"
- "age"
- "discharge_status"

# Merged time series: vitals, labs, and selected drug columns
# aligned on a single "datetime" index per patient.
merged_stage:
file_path: "hirid-merged-pyhealth.csv"
patient_id: "patientid"
timestamp: "datetime"
attributes:
- "heart_rate"
- "systolic_bp_invasive"
- "diastolic_bp_invasive"
- "mean_arterial_pressure"
- "cardiac_output"
- "spo2"
- "rass"
- "peak_inspiratory_pressure"
- "lactate_arterial"
- "lactate_venous"
- "inr"
- "serum_glucose"
- "c_reactive_protein"
- "dobutamine"
- "milrinone"
- "levosimendan"
- "theophyllin"
- "non_opioid_analgesics"

# Raw observation records in long format: each row carries a variableid
# with its numeric "value" (or "stringvalue"), entry time, status, and type.
observation_tables:
file_path: "hirid-observations-pyhealth.csv"
patient_id: "patientid"
timestamp: "datetime"
attributes:
- "entertime"
- "variableid"
- "value"
- "status"
- "stringvalue"
- "type"

# Drug administration records keyed on the time given ("givenat"):
# per-dose and cumulative amounts, fluid volumes, route, and infusion ids.
pharma_records:
file_path: "hirid-pharma-pyhealth.csv"
patient_id: "patientid"
timestamp: "givenat"
attributes:
- "pharmaid"
- "enteredentryat"
- "givendose"
- "cumulativedose"
- "fluidamount_calc"
- "cumulfluidamount_calc"
- "doseunit"
- "route"
- "infusionid"
- "typeid"
- "subtypeid"
- "recordstatus"

# Imputed stage: same clinical columns as merged_stage.
# NOTE(review): "timestamp" is intentionally null here; rows appear to be
# ordered by the relative offset column "reldatetime" (presumably time since
# admission) — confirm against the HiRID imputed-stage export.
imputed_stage:
file_path: "hirid-imputed-pyhealth.csv"
patient_id: "patientid"
timestamp: null
attributes:
- "reldatetime"
- "heart_rate"
- "systolic_bp_invasive"
- "diastolic_bp_invasive"
- "mean_arterial_pressure"
- "cardiac_output"
- "spo2"
- "rass"
- "peak_inspiratory_pressure"
- "lactate_arterial"
- "lactate_venous"
- "inr"
- "serum_glucose"
- "c_reactive_protein"
- "dobutamine"
- "milrinone"
- "levosimendan"
- "theophyllin"
- "non_opioid_analgesics"
Loading