Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ Available Datasets
datasets/pyhealth.datasets.SHHSDataset
datasets/pyhealth.datasets.SleepEDFDataset
datasets/pyhealth.datasets.EHRShotDataset
datasets/pyhealth.datasets.HiRIDDataset
datasets/pyhealth.datasets.Support2Dataset
datasets/pyhealth.datasets.BMDHSDataset
datasets/pyhealth.datasets.COVID19CXRDataset
Expand Down
9 changes: 9 additions & 0 deletions docs/api/datasets/pyhealth.datasets.HiRIDDataset.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
pyhealth.datasets.HiRIDDataset
===================================

The HiRID (High time Resolution ICU Dataset) contains ~34,000 ICU admissions from Bern University Hospital with high-resolution time-series data. Refer to `PhysioNet <https://physionet.org/content/hirid/1.1.1/>`_ for more information.

.. autoclass:: pyhealth.datasets.HiRIDDataset
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/api/tasks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ Available Tasks
Temple University EEG Tasks <tasks/pyhealth.tasks.temple_university_EEG_tasks>
Sleep Staging v2 <tasks/pyhealth.tasks.sleep_staging_v2>
Benchmark EHRShot <tasks/pyhealth.tasks.benchmark_ehrshot>
FAMEWS Fairness Audit <tasks/pyhealth.tasks.FAMEWS_fairness_audit>
ChestX-ray14 Binary Classification <tasks/pyhealth.tasks.ChestXray14BinaryClassification>
De-Identification NER <tasks/pyhealth.tasks.DeIDNERTask>
ChestX-ray14 Multilabel Classification <tasks/pyhealth.tasks.ChestXray14MultilabelClassification>
Expand Down
7 changes: 7 additions & 0 deletions docs/api/tasks/pyhealth.tasks.FAMEWS_fairness_audit.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
pyhealth.tasks.FAMEWS_fairness_audit
=====================================

.. autoclass:: pyhealth.tasks.FAMEWS_fairness_audit.FAMEWSFairnessAudit
:members:
:undoc-members:
:show-inheritance:
178 changes: 178 additions & 0 deletions examples/HiRID_fairness_audit.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "edaf0cae",
"metadata": {},
"source": [
"# FAMEWS Fairness Audit (Quick Start)\n",
"\n",
"This notebook shows a minimal end-to-end example for:\n",
"\n",
"1. Loading `HiRIDDataset`\n",
"2. Running `FAMEWSFairnessAudit` on a patient\n",
"3. Building a `SampleDataset` with `set_task(...)`\n",
"4. Printing a quick subgroup summary (sex and age group)\n",
"\n",
"Use `dev=True` for a fast smoke test, then switch to `dev=False` for full runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e33e0fa7",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"from pathlib import Path\n",
"import sys\n",
"\n",
"import pandas as pd\n",
"\n",
"def find_repo_root(start: Path) -> Path:\n",
" for path in [start, *start.parents]:\n",
" if (path / \"pyproject.toml\").exists() and (path / \"pyhealth\").exists():\n",
" return path\n",
" raise FileNotFoundError(\"Could not locate the PyHealth repository root from the current working directory.\")\n",
"\n",
"REPO_ROOT = find_repo_root(Path.cwd().resolve())\n",
"for name in list(sys.modules):\n",
" if name == \"pyhealth\" or name.startswith(\"pyhealth.\"):\n",
" del sys.modules[name]\n",
"if str(REPO_ROOT) in sys.path:\n",
" sys.path.remove(str(REPO_ROOT))\n",
"sys.path.insert(0, str(REPO_ROOT))\n",
"\n",
"from pyhealth.datasets.hirid import HiRIDDataset\n",
"from pyhealth.tasks.FAMEWS_fairness_audit import FAMEWSFairnessAudit\n",
"\n",
"HIRID_ROOT = REPO_ROOT / \"test-resources\" / \"core\" / \"hiriddemo\"\n",
"\n",
"assert HIRID_ROOT.exists(), (\n",
" f\"Expected HiRID root at {HIRID_ROOT}, but it was not found.\"\n",
")\n",
"\n",
"print(f\"Using HiRID root: {HIRID_ROOT}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ad8a1ef",
"metadata": {},
"outputs": [],
"source": [
"# Quick setup: use imputed stage + dev mode for fast iteration.\n",
"dataset = HiRIDDataset(\n",
" root=str(HIRID_ROOT),\n",
" stage=\"imputed\",\n",
" dev=True,\n",
")\n",
"\n",
"task = FAMEWSFairnessAudit(stage_table=\"imputed_stage\")\n",
"\n",
"# Inspect one raw task sample from a single patient.\n",
"first_pid = dataset.unique_patient_ids[0]\n",
"first_patient = dataset.get_patient(first_pid)\n",
"raw_samples = task(first_patient)\n",
"\n",
"print(f\"First patient id: {first_pid}\")\n",
"print(f\"Raw samples generated for first patient: {len(raw_samples)}\")\n",
"\n",
"if raw_samples:\n",
" raw0 = raw_samples[0]\n",
" print(\"Raw sample keys:\", sorted(raw0.keys()))\n",
" print(\"Demographics:\", {\n",
" \"sex\": raw0.get(\"sex\"),\n",
" \"age\": raw0.get(\"age\"),\n",
" \"age_group\": raw0.get(\"age_group\"),\n",
" \"discharge_status\": raw0.get(\"discharge_status\"),\n",
" })\n",
"\n",
" ts, values = raw0[\"signals\"]\n",
" print(\"Timeseries length:\", len(ts))\n",
" print(\"Values shape:\", values.shape)\n",
"\n",
" display(pd.DataFrame(values[:5], columns=raw0[\"feature_columns\"]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78981b54",
"metadata": {},
"outputs": [],
"source": [
"# Build a PyHealth SampleDataset (this applies processors and caches outputs).\n",
"sample_dataset = dataset.set_task(task, num_workers=1)\n",
"\n",
"print(\"Dataset stats:\")\n",
"dataset.stats()\n",
"print(f\"\\nNumber of processed ML samples: {len(sample_dataset)}\")\n",
"\n",
"sample0 = sample_dataset[0]\n",
"print(\"SampleDataset keys:\", sorted(sample0.keys()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98afa328",
"metadata": {},
"outputs": [],
"source": [
"# Quick fairness-oriented summary over a small cohort slice.\n",
"# We intentionally use raw task outputs here to keep metadata fields explicit.\n",
"max_patients = 200\n",
"sex_counter = Counter()\n",
"age_group_counter = Counter()\n",
"valid_patients = 0\n",
"\n",
"for pid in dataset.unique_patient_ids[:max_patients]:\n",
" p = dataset.get_patient(pid)\n",
" samples = task(p)\n",
" if not samples:\n",
" continue\n",
" valid_patients += 1\n",
" s = samples[0]\n",
" sex_counter[str(s.get(\"sex\"))] += 1\n",
" age_group_counter[str(s.get(\"age_group\"))] += 1\n",
"\n",
"print(f\"Patients scanned: {max_patients}\")\n",
"print(f\"Patients with at least one task sample: {valid_patients}\")\n",
"\n",
"summary_df = pd.DataFrame({\n",
" \"sex\": dict(sex_counter),\n",
" \"age_group\": dict(age_group_counter),\n",
"})\n",
"\n",
"display(summary_df.fillna(0).astype(int))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions pyhealth/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def __init__(self, *args, **kwargs):
from .dreamt import DREAMTDataset
from .ehrshot import EHRShotDataset
from .eicu import eICUDataset
from .hirid import HiRIDDataset
from .isruc import ISRUCDataset
from .medical_transcriptions import MedicalTranscriptionsDataset
from .mimic3 import MIMIC3Dataset
Expand Down
89 changes: 89 additions & 0 deletions pyhealth/datasets/configs/hirid.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# PyHealth table configuration for HiRID v1.1.1 (High time Resolution ICU
# Dataset, Bern University Hospital; see physionet.org/content/hirid/1.1.1).
# Each entry maps one exported CSV to its patient-id column, its timestamp
# column, and the attribute columns PyHealth should load.
version: "1.1.1"
tables:
# One row per ICU admission: demographics (sex, age) plus the
# discharge_status outcome, stamped with the admission time.
general_table:
file_path: "general_table.csv"
patient_id: "patientid"
timestamp: "admissiontime"
attributes:
- "sex"
- "age"
- "discharge_status"

# Merged time series: vitals, labs, and selected drug columns
# aligned on a single "datetime" index per patient.
merged_stage:
file_path: "hirid-merged-pyhealth.csv"
patient_id: "patientid"
timestamp: "datetime"
attributes:
- "heart_rate"
- "systolic_bp_invasive"
- "diastolic_bp_invasive"
- "mean_arterial_pressure"
- "cardiac_output"
- "spo2"
- "rass"
- "peak_inspiratory_pressure"
- "lactate_arterial"
- "lactate_venous"
- "inr"
- "serum_glucose"
- "c_reactive_protein"
- "dobutamine"
- "milrinone"
- "levosimendan"
- "theophyllin"
- "non_opioid_analgesics"

# Raw observation records in long format: each row carries a variableid
# with its numeric "value" (or "stringvalue"), entry time, status, and type.
observation_tables:
file_path: "hirid-observations-pyhealth.csv"
patient_id: "patientid"
timestamp: "datetime"
attributes:
- "entertime"
- "variableid"
- "value"
- "status"
- "stringvalue"
- "type"

# Drug administration records keyed on the time given ("givenat"):
# per-dose and cumulative amounts, fluid volumes, route, and infusion ids.
pharma_records:
file_path: "hirid-pharma-pyhealth.csv"
patient_id: "patientid"
timestamp: "givenat"
attributes:
- "pharmaid"
- "enteredentryat"
- "givendose"
- "cumulativedose"
- "fluidamount_calc"
- "cumulfluidamount_calc"
- "doseunit"
- "route"
- "infusionid"
- "typeid"
- "subtypeid"
- "recordstatus"

# Imputed stage: same clinical columns as merged_stage.
# NOTE(review): "timestamp" is intentionally null here; rows appear to be
# ordered by the relative offset column "reldatetime" (presumably time since
# admission) — confirm against the HiRID imputed-stage export.
imputed_stage:
file_path: "hirid-imputed-pyhealth.csv"
patient_id: "patientid"
timestamp: null
attributes:
- "reldatetime"
- "heart_rate"
- "systolic_bp_invasive"
- "diastolic_bp_invasive"
- "mean_arterial_pressure"
- "cardiac_output"
- "spo2"
- "rass"
- "peak_inspiratory_pressure"
- "lactate_arterial"
- "lactate_venous"
- "inr"
- "serum_glucose"
- "c_reactive_protein"
- "dobutamine"
- "milrinone"
- "levosimendan"
- "theophyllin"
- "non_opioid_analgesics"
Loading