diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst index 8d9a59d21..96cdab284 100644 --- a/docs/api/datasets.rst +++ b/docs/api/datasets.rst @@ -244,5 +244,6 @@ Available Datasets datasets/pyhealth.datasets.ClinVarDataset datasets/pyhealth.datasets.COSMICDataset datasets/pyhealth.datasets.TCGAPRADDataset + datasets/pyhealth.datasets.DailyAndSportActivities datasets/pyhealth.datasets.splitter datasets/pyhealth.datasets.utils diff --git a/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst b/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst new file mode 100644 index 000000000..b588d91ec --- /dev/null +++ b/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst @@ -0,0 +1,9 @@ +pyhealth.datasets.DailyAndSportActivitiesDataset +================================================ + +The Daily and Sport Activities dataset. For more information see `here <https://archive.ics.uci.edu/dataset/256/daily+and+sports+activities>`_. + +.. autoclass:: pyhealth.datasets.DailyAndSportActivitiesDataset + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/tasks.rst b/docs/api/tasks.rst index 23a4e06e5..115cf6667 100644 --- a/docs/api/tasks.rst +++ b/docs/api/tasks.rst @@ -230,3 +230,4 @@ Available Tasks Mutation Pathogenicity (COSMIC) Cancer Survival Prediction (TCGA) Cancer Mutation Burden (TCGA) + Daily and Sport Activities diff --git a/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst b/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst new file mode 100644 index 000000000..1461c30db --- /dev/null +++ b/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst @@ -0,0 +1,7 @@ +pyhealth.tasks.DailyAndSportActivitiesClassification +==================================================== + +.. 
autoclass:: pyhealth.tasks.DailyAndSportActivitiesClassification + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/examples/daily_sport_activities.ipynb b/examples/daily_sport_activities.ipynb new file mode 100644 index 000000000..c5cce666b --- /dev/null +++ b/examples/daily_sport_activities.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3d4f2a5e", + "metadata": {}, + "source": [ + "# Daily And Sport Activities RNN Ablation Study\n", + "\n", + "This notebook demonstrates an ablation study for the Daily and Sports Activities dataset using a built-in PyHealth RNN model.\n", + "\n", + "## Goal\n", + "Evaluate how task configuration affects downstream multiclass activity recognition performance.\n", + "\n", + "## Ablation configurations\n", + "1. `window_size=25, stride=10, normalize=True`\n", + "2. `window_size=50, stride=25, normalize=True`\n", + "3. `window_size=50, stride=25, normalize=False`\n", + "\n", + "## Workflow\n", + "- Build a small synthetic dataset in the real folder format: `aXX/pY/sZZ.txt`\n", + "- Create PyHealth task datasets under multiple task settings\n", + "- Train/evaluate a built-in PyHealth RNN\n", + "- Compare test accuracy across configurations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceac2c74", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import tempfile\n", + "import random\n", + "\n", + "import numpy as np\n", + "\n", + "from pyhealth.datasets import DailyAndSportActivitiesDataset\n", + "from pyhealth.tasks import DailyAndSportActivitiesClassification\n", + "from pyhealth.datasets import split_by_patient, get_dataloader\n", + "from pyhealth.models import RNN\n", + "from pyhealth.trainer import Trainer\n", + "\n", + "random.seed(123)\n", + "np.random.seed(123)" + ] + }, + { + "cell_type": "markdown", + "id": "1c74bc91", + "metadata": {}, + "source": [ + "## Synthetic data generation\n", + "\n", + 
"We create synthetic segment files in the same folder structure as the real dataset:\n", + "\n", + "- `a01`, `a02`, `a03` for activities\n", + "- `p1`, `p2`, ... for subjects\n", + "- `s01.txt`, `s02.txt`, ... for segments\n", + "\n", + "Each file is a `125 x 45` matrix:\n", + "- 125 rows = 5 seconds × 25 Hz\n", + "- 45 columns = 5 body units × 9 sensor axes\n", + "\n", + "We inject class-specific structure so the model has a signal to learn." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceae0a10", + "metadata": {}, + "outputs": [], + "source": [ + "def _write_fake_signal_file(\n", + " file_path: Path,\n", + " activity_idx: int,\n", + " shape=(125, 45),\n", + " seed: int = 0,\n", + ") -> None:\n", + " rng = np.random.default_rng(seed)\n", + " signal = rng.normal(loc=0.0, scale=1.0, size=shape).astype(np.float32)\n", + "\n", + " # Inject class-dependent structure\n", + " start_col = activity_idx * 3\n", + " end_col = min(start_col + 3, shape[1])\n", + " signal[:, start_col:end_col] += 2.5\n", + "\n", + " # Add a mild temporal trend by class\n", + " t = np.linspace(0, 1, shape[0], dtype=np.float32)[:, None]\n", + " signal[:, start_col:end_col] += (activity_idx + 1) * 0.5 * t\n", + "\n", + " file_path.parent.mkdir(parents=True, exist_ok=True)\n", + " np.savetxt(file_path, signal, delimiter=\",\", fmt=\"%.6f\")\n", + "\n", + "\n", + "def build_synthetic_dataset(root: Path) -> None:\n", + " activities = [\"a01\", \"a02\", \"a03\"]\n", + " subjects = [f\"p{i}\" for i in range(1, 7)]\n", + " segments_per_subject = 4\n", + "\n", + " seed = 0\n", + " for activity_idx, activity in enumerate(activities):\n", + " for subject in subjects:\n", + " for seg in range(1, segments_per_subject + 1):\n", + " file_path = root / activity / subject / f\"s{seg:02d}.txt\"\n", + " _write_fake_signal_file(\n", + " file_path=file_path,\n", + " activity_idx=activity_idx,\n", + " seed=seed,\n", + " )\n", + " seed += 1" + ] + }, + { + "cell_type": "markdown", + "id": 
"6df68140", + "metadata": {}, + "source": [ + "## Build the synthetic dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3b023a0", + "metadata": {}, + "outputs": [], + "source": [ + "tmpdir = tempfile.TemporaryDirectory()\n", + "root = Path(tmpdir.name) / \"daily_sport_activities\"\n", + "build_synthetic_dataset(root)\n", + "\n", + "root" + ] + }, + { + "cell_type": "markdown", + "id": "08e695fe", + "metadata": {}, + "source": [ + "## Define one ablation run\n", + "\n", + "For each configuration:\n", + "- build a task-specific sample dataset\n", + "- split by patient\n", + "- create dataloaders\n", + "- train a built-in PyHealth RNN\n", + "- evaluate on the test set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "227158ea", + "metadata": {}, + "outputs": [], + "source": [ + "def run_one_config(root: Path, cfg: dict) -> dict:\n", + " dataset = DailyAndSportActivitiesDataset(root=str(root))\n", + " task = DailyAndSportActivitiesClassification(signal_loader=dataset.load_signal, **cfg)\n", + "\n", + " sample_dataset = dataset.set_task(task)\n", + "\n", + " train_ds, val_ds, test_ds = split_by_patient(sample_dataset, [0.6, 0.2, 0.2])\n", + "\n", + " train_loader = get_dataloader(train_ds, batch_size=16, shuffle=True)\n", + " val_loader = get_dataloader(val_ds, batch_size=16, shuffle=False)\n", + " test_loader = get_dataloader(test_ds, batch_size=16, shuffle=False)\n", + "\n", + " model = RNN(\n", + " dataset=sample_dataset,\n", + " embedding_dim=128,\n", + " hidden_dim=64,\n", + " rnn_type=\"GRU\",\n", + " num_layers=1,\n", + " dropout=0.1,\n", + " )\n", + "\n", + " trainer = Trainer(\n", + " model=model,\n", + " metrics=[\"accuracy\"],\n", + " device=\"cpu\",\n", + " enable_logging=False,\n", + " )\n", + "\n", + " trainer.train(\n", + " train_dataloader=train_loader,\n", + " val_dataloader=val_loader,\n", + " epochs=5,\n", + " monitor=\"accuracy\",\n", + " monitor_criterion=\"max\",\n", + " )\n", + "\n", + " 
scores = trainer.evaluate(test_loader)\n", + "\n", + " return {\n", + " \"config\": cfg,\n", + " \"n_total_samples\": len(sample_dataset),\n", + " \"n_train\": len(train_ds),\n", + " \"n_val\": len(val_ds),\n", + " \"n_test\": len(test_ds),\n", + " \"accuracy\": float(scores[\"accuracy\"]),\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "aa33d665", + "metadata": {}, + "source": [ + "## Define ablation settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fca5285", + "metadata": {}, + "outputs": [], + "source": [ + "configs = [\n", + " {\"window_size\": 25, \"stride\": 10, \"normalize\": True},\n", + " {\"window_size\": 50, \"stride\": 25, \"normalize\": True},\n", + " {\"window_size\": 50, \"stride\": 25, \"normalize\": False},\n", + "]\n", + "\n", + "configs" + ] + }, + { + "cell_type": "markdown", + "id": "8a3d3843", + "metadata": {}, + "source": [ + "## Run the ablation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c5784e2", + "metadata": {}, + "outputs": [], + "source": [ + "results = []\n", + "\n", + "for cfg in configs:\n", + " print(f\"Running config: {cfg}\")\n", + " result = run_one_config(root, cfg)\n", + " results.append(result)\n", + " print(result)\n", + " print(\"-\" * 80)" + ] + }, + { + "cell_type": "markdown", + "id": "d36015e3", + "metadata": {}, + "source": [ + "## Summarize results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df67cb3d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "pd.set_option(\"display.max_colwidth\", None)\n", + "\n", + "results_df = pd.DataFrame(results)\n", + "results_df" + ] + }, + { + "cell_type": "markdown", + "id": "3d194c27", + "metadata": {}, + "source": [ + "## Sort by accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89fe3ef7", + "metadata": {}, + "outputs": [], + "source": [ + "results_df.sort_values(\"accuracy\", ascending=False).reset_index(drop=True)" 
+ ] + }, + { + "cell_type": "markdown", + "id": "eb351598", + "metadata": {}, + "source": [ + "## Brief interpretation\n", + "\n", + "- Larger windows (50) outperform smaller windows (25), likely due to better temporal context.\n", + "- Normalization **decreased** performance in this setup, suggesting the model may already handle raw scale well or synthetic signal structure was distorted.\n", + "- Smaller windows increase sample count but may reduce per-sample information, hurting performance.\n", + "- Note: Results are based on synthetic data; trends illustrate pipeline behavior rather than real-world performance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b5a3c92", + "metadata": {}, + "outputs": [], + "source": [ + "best_row = results_df.sort_values(\"accuracy\", ascending=False).iloc[0]\n", + "\n", + "print(\"Best configuration:\")\n", + "print(best_row[\"config\"])\n", + "print(f\"Accuracy: {best_row['accuracy']:.4f}\")\n", + "print(f\"Total task samples: {best_row['n_total_samples']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a4b2cfc", + "metadata": {}, + "outputs": [], + "source": [ + "tmpdir.cleanup()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyhealth-project", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py index 50b1b3887..7dd25a4c8 100644 --- a/pyhealth/datasets/__init__.py +++ b/pyhealth/datasets/__init__.py @@ -68,6 +68,7 @@ def __init__(self, *args, **kwargs): from .bmd_hs import BMDHSDataset from .support2 import Support2Dataset from .tcga_prad import TCGAPRADDataset +from 
.daily_sport_activities import DailyAndSportActivitiesDataset from .splitter import ( sample_balanced, split_by_patient, diff --git a/pyhealth/datasets/configs/daily_sport_activities.yaml b/pyhealth/datasets/configs/daily_sport_activities.yaml new file mode 100644 index 000000000..66d0f2bd4 --- /dev/null +++ b/pyhealth/datasets/configs/daily_sport_activities.yaml @@ -0,0 +1,16 @@ +# Author: Sezim Zamirbekova (szami2) and Niam Pattni (npattni2) +version: "1.0" +tables: + daily_sport_activities: + file_path: "" + patient_id: "patient_id" + timestamp: null + attributes: + - "record_id" + - "patient_id" + - "visit_id" + - "activity_id" + - "activity" + - "segment_id" + - "file_path" + - "signal" \ No newline at end of file diff --git a/pyhealth/datasets/daily_sport_activities.py b/pyhealth/datasets/daily_sport_activities.py new file mode 100644 index 000000000..47fb53ff3 --- /dev/null +++ b/pyhealth/datasets/daily_sport_activities.py @@ -0,0 +1,354 @@ +""" +PyHealth dataset for the Daily and Sports Activity dataset. + +Dataset link: + https://archive.ics.uci.edu/dataset/256/daily+and+sports+activities + +Dataset paper: + Zhang, H.; Zhan, D.; Lin, Y.; He, J.; Zhu, Q.; Shen, Z.-J.; and + Zheng, Z. 2024. Daily Physical Activity Monitoring: Adaptive Learning + from Multi-source Motion Sensor Data. Proceedings of the fifth Conference + on Health, Inference, and Learning, volume 248 of Proceedings of Machine + Learning Research, 39–54. 
PMLR + +Dataset paper link: + https://raw.githubusercontent.com/mlresearch/v248/main/assets/zhang24a/zhang24a.pdf + +Authors: + Niam Pattni (npattni2@illinois.edu) + Sezim Zamirbekova (szami2@illinois.edu) +""" +from __future__ import annotations + +from pathlib import Path +from typing import Dict, List, Optional, Any + +import io +import zipfile + +import dask.dataframe as dd +import numpy as np +import pandas as pd +import requests + +from pyhealth.datasets import BaseDataset + + +class DailyAndSportActivitiesDataset(BaseDataset): + """ + PyHealth dataset for Daily and Sports Activities data. + + This dataset parses multi-sensor time-series text files into structured + samples suitable for downstream task processing. + + Expected folder layout example: + root/ + a01/ + p1/ + s01.txt + s02.txt + a02/ + p1/ + s01.txt + + Each .txt file is expected to contain numeric sensor values arranged + row-wise over time. aXX represents an activity ID, p* represents a subject ID, + and sXX represents a specific sensor ID. + + Parsed data format: + - "record_id": str + - "patient_id": str + - "visit_id": str + - "activity_id": str + - "activity": int + - "segment_id": str + - "file_path": str + - "signal": np.ndarray # shape: [time_steps, num_features] + + Attributes: + root (str): Root directory of the raw data. + root_path (Path): Root directory of the raw data. + config_path (str): Path to the configuration file. + activities (List[str]): Ordered list of activities by ID, activities[0] == a01. 
+ """ + + activities: List[str] = ["sitting", "standing", "lying on back", + "lying on right side", "ascending stairs", + "descending stairs", "standing still in elevator", + "moving around in elevator", "walking in parking lot", + "walking on 4km/h treadmill 0 incline", + "walking on 4km/h treadmill 15 incline", + "running on 8km/h treadmill", "using stair stepper", + "using cross trainer", "cycling in horizontal position", + "cycling in vertical position", "rowing", "jumping", + "playing basketball"] + + def __init__( + self, + root: str = ".", + config_path: Optional[str] = str( + Path(__file__).parent / "configs" / "daily_sport_activities.yaml" + ), + download: bool = False, + dev: bool = False, + ): + """ + Initializes the Daily and Sports Activities dataset. + + Args: + root (str): Root director of the raw data. Defaults to the current + working directory. + config_path (Optional[str]): Path to the configuration file. + Defaults to "../configs/daily_sport_activities.yaml". + download (bool): Whether to download the dataset or use an existing copy. + Defaults to False. + dev (bool): Configures parent BaseDataset. Defaults to False. + + Raises: + FileNotFoundError: If the dataset cannot be found in the specified + directory. + NotADirectoryError: If the specified root path is not a directory. + """ + self.root_path = Path(root) + self.root = root + + if download: + self._download(self.root) + + if not self.root_path.exists(): + raise FileNotFoundError(f"Dataset root does not exist: {self.root_path}") + + if not self.root_path.is_dir(): + raise NotADirectoryError( + f"Dataset root is not a directory: {self.root_path}" + ) + + super().__init__( + root=self.root, + tables=["daily_sport_activities"], + dataset_name="daily_sport_activities", + config_path=config_path, + dev=dev, + ) + + @property + def default_task(self): + """ + Returns the default task for this dataset. + + Returns: + DailyAndSportActivitiesClassification: The default classification task. 
+ + Example:: + >>> dataset = DailyAndSportActivitiesDataset() + >>> task = dataset.default_task + """ + from pyhealth.tasks import DailyAndSportActivitiesClassification + return DailyAndSportActivitiesClassification(signal_loader=self.load_signal) + + def _download(self, root: str) -> None: + """Downloads the Daily and Sports Activities dataset and extracts the + compressed data. + + Args: + root (str): Root directory of raw data. + Raises: + HTTPError: If the file cannot be downloaded. + """ + + dataset_url = "https://archive.ics.uci.edu/static/public/256/daily+and+sports+activities.zip" + + root_path = Path(root) + root_path.mkdir(parents=True, exist_ok=True) + + response = requests.get(dataset_url, timeout=60) + response.raise_for_status() + + with zipfile.ZipFile(io.BytesIO(response.content)) as zf: + zf.extractall(root_path) + + def _discover_files(self) -> List[Path]: + """ + Find all text files under the dataset root. + + Returns: + List[Path]: List of all paths to relevant text files + + Raises: + FileNotFoundError: No text files exist in the specified root path. + """ + txt_files = sorted(self.root_path.rglob("*.txt")) + + if not txt_files: + raise FileNotFoundError( + f"No .txt files found under dataset root: {self.root_path}" + ) + + return txt_files + + def _infer_metadata_from_path(self, file_path: Path) -> Dict[str, str]: + """ + Infer activity, subject, and segment identifiers from the file path. + + Args: + file_path (Path): The path to the given text file. + + Returns: + Dict[str, str]: Map from metadata name to value for record, activity, + patient, and segment IDs + + Raises: + ValueError: Folder structure does not follow aXX/p*/sXX.txt. + ValueError: Activity folder name doesn't start with a. + ValueError: Patient folder name doesn't start with p. + ValueError: Segment file name doesn't start with s. 
+ """ + relative_parts = file_path.relative_to(self.root_path).parts + + if len(relative_parts) < 3: + raise ValueError( + f"Unexpected file structure for {file_path}. " + f"Expected at least activity/subject/file.txt" + ) + + activity_id = relative_parts[-3] + patient_id = relative_parts[-2] + segment_id = file_path.stem + record_id = f"{patient_id}_{activity_id}_{segment_id}" + + if not activity_id.startswith("a"): + raise ValueError(f"Invalid activity folder name: {activity_id}") + if not patient_id.startswith("p"): + raise ValueError(f"Invalid subject folder name: {patient_id}") + if not segment_id.startswith("s"): + raise ValueError(f"Invalid segment filename: {segment_id}") + + return { + "record_id": record_id, + "activity_id": activity_id, + "patient_id": patient_id, + "segment_id": segment_id, + } + + def load_signal(self, file_path: str | Path) -> np.ndarray: + """ + Load and validate a 125 x 45 sensor matrix from a text file. + + Args: + file_path (str | Path): The path to the text file to load. + + Returns: + np.ndarray: The value loaded from the text file of the sensor. + + Raises: + ValueError: Couldn't read numeric data from the file. + ValueError: Empty target file. + ValueError: Shape of parsed data is not 2D. + ValueError: Shape of parsed data is not (125, 45). + ValueError: Parsed data contains NaN or Inf values. 
+ """ + file_path = Path(file_path) + + try: + signal = np.loadtxt(file_path, delimiter=",", dtype=np.float32) + except Exception as e: + raise ValueError( + f"Failed to parse numeric data from {file_path}: {e}" + ) from e + + if signal.size == 0: + raise ValueError(f"Empty signal file: {file_path}") + + if signal.ndim == 1: + signal = np.expand_dims(signal, axis=1) + + if signal.ndim != 2: + raise ValueError( + f"""Signal in {file_path} must be 2D after parsing, got shape + {signal.shape}""" + ) + + if signal.shape != (125, 45): + raise ValueError( + f"Signal in {file_path} must have shape (125, 45), got {signal.shape}" + ) + + if not np.isfinite(signal).all(): + raise ValueError(f"Signal contains NaN or Inf values: {file_path}") + + return signal + + def _get_activity_name(self, activity_id: str) -> str: + """ + Get activity name from a given ID. + + Args: + activity_id (str): The ID number XX from the folder "root/aXX/...". + + Returns: + str: The corresponding activity name from activities[XX - 1] + + Raises: + ValueError: Activity ID is not within 1 to 19 inclusive. + """ + idx = int(activity_id[1:]) - 1 + if idx < 0 or idx >= len(self.activities): + raise ValueError(f"Invalid activity_id: {activity_id}") + return self.activities[idx] + + def _parse_file_to_event_row(self, file_path: Path) -> Dict[str, Any]: + """ + Reads a target file and folder structure and parses it into an event row. + + Args: + file_path (Path): Path to target dataset file. 
+ + Returns: + Dict[str, Any]: Maps attribute name to value from parsed data + """ + metadata = self._infer_metadata_from_path(file_path) + signal = self.load_signal(file_path) + + activity_name = self._get_activity_name(metadata["activity_id"]) + + return { + "patient_id": metadata["patient_id"], + "event_type": "daily_sport_activities", + "timestamp": pd.NaT, + "daily_sport_activities/record_id": metadata["record_id"], + "daily_sport_activities/visit_id": metadata["segment_id"], + "daily_sport_activities/activity_id": metadata["activity_id"], + "daily_sport_activities/activity": activity_name, + "daily_sport_activities/segment_id": metadata["segment_id"], + "daily_sport_activities/file_path": str(file_path), + "daily_sport_activities/n_rows": int(signal.shape[0]), + "daily_sport_activities/n_cols": int(signal.shape[1]), + "daily_sport_activities/sampling_rate_hz": 25, + "daily_sport_activities/duration_seconds": 5, + } + + def load_data(self) -> dd.DataFrame: + """ + Load raw segment files into a PyHealth-compatible event dataframe. + + Returns: + dd.DataFrame: Dask dataframe of event rows from parsed data. + + Raises: + ValueError: No valid parsed data exists. 
+ """ + rows: List[Dict[str, Any]] = [] + txt_files = self._discover_files() + + for file_path in txt_files: + rows.append(self._parse_file_to_event_row(file_path)) + + if not rows: + raise ValueError("No samples were parsed from the dataset.") + + pdf = pd.DataFrame(rows) + pdf["patient_id"] = pdf["patient_id"].astype("string") + pdf["event_type"] = pdf["event_type"].astype("string") + pdf["timestamp"] = pd.NaT + + return dd.from_pandas(pdf, npartitions=1) diff --git a/pyhealth/tasks/__init__.py b/pyhealth/tasks/__init__.py index a32618f9c..8392d9131 100644 --- a/pyhealth/tasks/__init__.py +++ b/pyhealth/tasks/__init__.py @@ -67,3 +67,4 @@ VariantClassificationClinVar, ) from .patient_linkage_mimic3 import PatientLinkageMIMIC3Task +from .daily_sport_activities import DailyAndSportActivitiesClassification diff --git a/pyhealth/tasks/daily_sport_activities.py b/pyhealth/tasks/daily_sport_activities.py new file mode 100644 index 000000000..4cbe616ad --- /dev/null +++ b/pyhealth/tasks/daily_sport_activities.py @@ -0,0 +1,224 @@ +""" +PyHealth task for classification using the Daily and Sports Activity dataset. + +Dataset link: + https://archive.ics.uci.edu/dataset/256/daily+and+sports+activities + +Dataset paper: + Zhang, H.; Zhan, D.; Lin, Y.; He, J.; Zhu, Q.; Shen, Z.-J.; and + Zheng, Z. 2024. Daily Physical Activity Monitoring: Adaptive Learning + from Multi-source Motion Sensor Data. Proceedings of the fifth Conference + on Health, Inference, and Learning, volume 248 of Proceedings of Machine + Learning Research, 39–54. 
PMLR + +Dataset paper link: + https://raw.githubusercontent.com/mlresearch/v248/main/assets/zhang24a/zhang24a.pdf + +Authors: + Niam Pattni (npattni2@illinois.edu) + Sezim Zamirbekova (szami2@illinois.edu) +""" +from __future__ import annotations + +from typing import Callable, Dict, List, Optional + +import numpy as np + +from pyhealth.data import Event, Patient +from pyhealth.tasks import BaseTask + + +def _normalize_signal(signal: np.ndarray) -> np.ndarray: + """ + Apply per-feature z-score normalization. + + Args: + signal (np.ndarray): The signal tensor to normalize. + + Returns: + np.ndarray: The normalized signal. + """ + mean = signal.mean(axis=0, keepdims=True) + std = signal.std(axis=0, keepdims=True) + std = np.where(std < 1e-8, 1.0, std) + return (signal - mean) / std + +def _validate_and_select_features( + signal: np.ndarray, + selected_features: Optional[List[int]], +) -> np.ndarray: + """ + Validate optional feature indices and subset the signal. + + Args: + signal (np.ndarray): The original signal before feature selection. + selected_features (Optional[List[int]]): Features to select from the + specified signal. + + Returns: + np.ndarray: The signal after feature selection. + + Raises: + ValueError: selected_features is empty. + ValueError: There is an index in selected_features which is out of bounds + for the signal. + """ + if selected_features is None: + return signal + + if len(selected_features) == 0: + raise ValueError("selected_features cannot be an empty list.") + + n_features = signal.shape[1] + for idx in selected_features: + if idx < 0 or idx >= n_features: + raise ValueError( + f"""Feature index {idx} is out of bounds for signal with {n_features} + features.""" + ) + + return signal[:, selected_features] + +def _sliding_windows( + signal: np.ndarray, + window_size: int, + stride: int, +) -> List[np.ndarray]: + """ + Split a sequence into fixed-size sliding windows. + + Args: + signal (np.ndarray): The signal to split. 
+        window_size (int): The size of the sliding window. + stride (int): How far to slide the window at each step. + + Returns: + List[np.ndarray]: Windows generated from the signal. + + Raises: + ValueError: Window size is not positive. + ValueError: Stride is not positive. + """ + if window_size <= 0: + raise ValueError(f"window_size must be positive, got {window_size}") + + if stride <= 0: + raise ValueError(f"stride must be positive, got {stride}") + + n_steps = signal.shape[0] + windows: List[np.ndarray] = [] + + if n_steps < window_size: + return windows + + for start in range(0, n_steps - window_size + 1, stride): + end = start + window_size + windows.append(signal[start:end]) + + return windows + +class DailyAndSportActivitiesClassification(BaseTask): + """ + A PyHealth task class for classification of activities in the Daily and + Sport Activities dataset. + + Attributes: + task_name (str): The name of the task. + input_schema (Dict[str, str]): The schema for the task input. + output_schema (Dict[str, str]): The schema for the task output. + + Examples: + >>> from pyhealth.datasets import DailyAndSportActivitiesDataset + >>> from pyhealth.tasks import DailyAndSportActivitiesClassification + >>> dataset = DailyAndSportActivitiesDataset(download=True) + >>> task = DailyAndSportActivitiesClassification(signal_loader=dataset.load_signal) + >>> samples = dataset.set_task(task) + """ + task_name: str = "DailyAndSportActivitiesClassification" + input_schema: Dict[str, str] = {"signal": "tensor"} + output_schema: Dict[str, str] = {"label": "multiclass"} + + def __init__( + self, + signal_loader: Callable, + window_size: int = 50, + stride: int = 25, + normalize: bool = True, + selected_features: Optional[List[int]] = None, + ) -> None: + """ + Initializes the DailyAndSportActivitiesClassification task. + + Args: + signal_loader (Callable): The function to use for parsing signal data. + window_size (int): The size of the sliding window on the input signal. + Defaults to 50. 
+ stride (int): The size of the sliding window move. Defauts to 25. + normalize (bool): Should the signal data be normalized. Defaults to True. + selected_features (Optional[List[int]]): Features to select from the signal. + Defaults to None (all features). + + Raises: + ValueError: Window size is negative. + ValueError: Stride is negative. + """ + if window_size <= 0: + raise ValueError(f"window_size must be positive, got {window_size}") + if stride <= 0: + raise ValueError(f"stride must be positive, got {stride}") + + self.signal_loader = signal_loader + self.window_size = window_size + self.stride = stride + self.normalize = normalize + self.selected_features = selected_features + + def __call__(self, patient: Patient) -> List[Dict]: + """ + Generate activity-recognition samples for a single patient. + + Args: + patient (Patient): The patient to generate samples for. + + Returns: + List[Dict]: The list of samples for the specified patient. + """ + events: List[Event] = patient.get_events(event_type="daily_sport_activities") + + samples: List[Dict] = [] + + for event in events: + signal = self.signal_loader(event["file_path"]) + signal = _validate_and_select_features(signal, self.selected_features) + + if self.normalize: + signal = _normalize_signal(signal) + + windows = _sliding_windows( + signal=signal, + window_size=self.window_size, + stride=self.stride, + ) + + activity_id = event["activity_id"] + if isinstance(activity_id, str) and activity_id.startswith("a"): + label = int(activity_id[1:]) - 1 + else: + label = int(activity_id) + + record_id = event["record_id"] + visit_id = event["visit_id"] + + for idx, window in enumerate(windows): + samples.append( + { + "patient_id": patient.patient_id, + "visit_id": visit_id, + "record_id": f"{record_id}_win_{idx}", + "signal": window.astype(np.float32), + "label": label, + } + ) + + return samples + \ No newline at end of file diff --git a/tests/core/test_daily_sport_activities.py 
"""
Unit tests for the DailyAndSportActivitiesDataset and
DailyAndSportActivitiesClassification classes.

Authors:
    Niam Pattni (npattni2@illinois.edu)
    Sezim Zamirbekova (szami2@illinois.edu)
"""
from pathlib import Path
import shutil
import unittest

import numpy as np
import torch

from pyhealth.datasets import DailyAndSportActivitiesDataset
from pyhealth.tasks import DailyAndSportActivitiesClassification


class TestDailyAndSportActivityDataset(unittest.TestCase):
    """End-to-end tests for the Daily and Sport Activities dataset and task.

    A tiny synthetic dataset is generated once per class run in the real
    folder layout (``aXX/pY/sZZ.txt``); the dataset and its default task are
    built in ``setUpClass`` and shared (read-only) by all test methods.
    """

    @classmethod
    def setUpClass(cls):
        # Shared test-resources root; the fake activity folders are created
        # inside it and removed again in tearDownClass.
        cls.root = (
            Path(__file__).parent.parent.parent
            / "test-resources"
            / "core"
            / "daily-sport-activities"
        )
        cls.generate_fake_dataset()
        cls.dataset = DailyAndSportActivitiesDataset(cls.root)
        cls.samples = cls.dataset.set_task(cls.dataset.default_task)

    @classmethod
    def tearDownClass(cls):
        cls.samples.close()
        # Remove only the generated activity folders (a01, a02, ...) so any
        # other fixtures under the shared root are left untouched.
        for activity_dir in cls.root.glob("a*"):
            if activity_dir.is_dir():
                shutil.rmtree(activity_dir)

    @classmethod
    def generate_fake_dataset(cls):
        """Create a tiny synthetic dataset under ``cls.root``.

        Mirrors the real dataset's folder structure::

            root/
                a01/
                    p1/
                        s01.txt
                        s02.txt
                    p7/
                        s01.txt
                    p8/
                        s01.txt
                a02/
                    p1/
                        s01.txt
                    p7/
                        s01.txt
                    p8/
                        s01.txt

        Each segment file holds a 125x45 float32 matrix, matching the real
        data's 5 s @ 25 Hz x 45 sensor channels shape.
        """
        files = [
            ("a01", "p1", "s01.txt"),
            ("a01", "p1", "s02.txt"),
            ("a01", "p7", "s01.txt"),
            ("a01", "p8", "s01.txt"),
            ("a02", "p1", "s01.txt"),
            ("a02", "p7", "s01.txt"),
            ("a02", "p8", "s01.txt"),
        ]

        # Seeded generator keeps the fixture deterministic across runs.
        rng = np.random.default_rng(123)

        for activity, subject, segment in files:
            path = cls.root / activity / subject / segment
            path.parent.mkdir(parents=True, exist_ok=True)
            data = rng.normal(size=(125, 45)).astype(np.float32)
            np.savetxt(path, data, delimiter=",", fmt="%.6f")

    def test_stats(self):
        # Smoke test: stats() must run without raising on the tiny dataset.
        self.dataset.stats()

    def test_num_patients(self):
        # p1, p7, p8 are the three subjects generated above.
        self.assertEqual(len(self.dataset.unique_patient_ids), 3)

    def test_load_signal_returns_expected_shape_and_dtype(self):
        file_path = self.root / "a01" / "p1" / "s01.txt"
        signal = self.dataset.load_signal(file_path)

        self.assertEqual(signal.shape, (125, 45))
        self.assertEqual(signal.dtype, np.float32)

    def test_load_signal_invalid_shape_raises(self):
        # A 124-row file violates the expected 125x45 segment shape. The
        # bogus a03/ tree is cleaned up in a finally block so a failing
        # assertion cannot leak it into the shared root used by other tests.
        bad_path = self.root / "a03" / "p1" / "s01.txt"
        bad_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            bad_signal = np.random.randn(124, 45).astype(np.float32)
            np.savetxt(bad_path, bad_signal, delimiter=",", fmt="%.6f")

            with self.assertRaises(ValueError):
                self.dataset.load_signal(bad_path)
        finally:
            shutil.rmtree(self.root / "a03", ignore_errors=True)

    def test_load_data_returns_event_dataframe(self):
        df = self.dataset.load_data().compute()

        # One event per generated segment file.
        self.assertEqual(len(df), 7)
        self.assertIn("patient_id", df.columns)
        self.assertIn("event_type", df.columns)
        self.assertIn("timestamp", df.columns)
        self.assertIn("daily_sport_activities/record_id", df.columns)
        self.assertIn("daily_sport_activities/visit_id", df.columns)
        self.assertIn("daily_sport_activities/activity_id", df.columns)
        self.assertIn("daily_sport_activities/activity", df.columns)
        self.assertIn("daily_sport_activities/file_path", df.columns)
        self.assertIn("daily_sport_activities/n_rows", df.columns)
        self.assertIn("daily_sport_activities/n_cols", df.columns)

        self.assertEqual(set(df["event_type"].unique()), {"daily_sport_activities"})

    def test_get_patient_p1(self):
        # p1 has two a01 segments plus one a02 segment.
        events = self.dataset.get_patient("p1").get_events(
            event_type="daily_sport_activities"
        )

        self.assertEqual(len(events), 3)

        self.assertEqual(events[0]["visit_id"], "s01")
        self.assertEqual(events[1]["visit_id"], "s02")
        self.assertIn(events[2]["activity"], {"sitting", "standing"})

    def test_get_patient_p7(self):
        events = self.dataset.get_patient("p7").get_events(
            event_type="daily_sport_activities"
        )

        self.assertEqual(len(events), 2)
        self.assertEqual(set(event["activity_id"] for event in events), {"a01", "a02"})

    def test_get_patient_p8(self):
        events = self.dataset.get_patient("p8").get_events(
            event_type="daily_sport_activities"
        )

        self.assertEqual(len(events), 2)
        self.assertEqual(set(event["activity_id"] for event in events), {"a01", "a02"})

    def test_event_metadata_is_parsed_correctly(self):
        df = self.dataset.load_data().compute()
        row = df.iloc[0]

        self.assertIn(row["patient_id"], {"p1", "p7", "p8"})
        self.assertIn(row["daily_sport_activities/activity_id"], {"a01", "a02"})
        self.assertIn(row["daily_sport_activities/activity"], {"sitting", "standing"})
        self.assertIn(row["daily_sport_activities/visit_id"], {"s01", "s02"})
        self.assertEqual(row["daily_sport_activities/n_rows"], 125)
        self.assertEqual(row["daily_sport_activities/n_cols"], 45)
        # 125 rows at 25 Hz => 5 second segments.
        self.assertEqual(row["daily_sport_activities/sampling_rate_hz"], 25)
        self.assertEqual(row["daily_sport_activities/duration_seconds"], 5)

    def test_default_task(self):
        self.assertIsInstance(
            self.dataset.default_task,
            DailyAndSportActivitiesClassification,
        )

    def test_task_generates_expected_number_of_samples(self):
        # 7 segments x 4 windows each (125 rows, window 50, stride 25) = 28.
        self.assertEqual(len(self.samples), 28)

    def test_task_sample_structure(self):
        sample = self.samples[0]

        self.assertIn("patient_id", sample)
        self.assertIn("visit_id", sample)
        self.assertIn("record_id", sample)
        self.assertIn("signal", sample)
        self.assertIn("label", sample)

        self.assertEqual(sample["signal"].shape, (50, 45))
        self.assertEqual(sample["signal"].dtype, torch.float32)
        self.assertIsInstance(sample["label"], torch.Tensor)
        self.assertEqual(sample["label"].dtype, torch.int64)

    def test_task_labels_are_correct(self):
        # Convert the scalar label tensors to plain ints so list.count()
        # compares with integer equality rather than tensor __eq__ semantics
        # (which returns a tensor and only coerces to bool for 0-dim values).
        labels = [int(sample["label"]) for sample in self.samples]
        # 4 a01 segments * 4 windows = 16; 3 a02 segments * 4 windows = 12.
        self.assertEqual(labels.count(0), 16)
        self.assertEqual(labels.count(1), 12)

    def test_task_selected_features_reduces_dimension(self):
        samples = self.dataset.set_task(
            DailyAndSportActivitiesClassification(
                signal_loader=self.dataset.load_signal,
                window_size=50,
                stride=25,
                selected_features=[0, 1, 2, 3],
            )
        )

        self.assertGreater(len(samples), 0)
        self.assertEqual(samples[0]["signal"].shape, (50, 4))
        samples.close()

    def test_task_invalid_feature_index_raises(self):
        # Feature index out of range for the 45 available channels.
        with self.assertRaises(ValueError):
            self.dataset.set_task(
                DailyAndSportActivitiesClassification(
                    signal_loader=self.dataset.load_signal,
                    window_size=50,
                    stride=25,
                    selected_features=[999],
                )
            )

    def test_task_empty_selected_features_raises(self):
        with self.assertRaises(ValueError):
            self.dataset.set_task(
                DailyAndSportActivitiesClassification(
                    signal_loader=self.dataset.load_signal,
                    window_size=50,
                    stride=25,
                    selected_features=[],
                )
            )

    def test_task_invalid_window_size_raises(self):
        with self.assertRaises(ValueError):
            DailyAndSportActivitiesClassification(
                signal_loader=self.dataset.load_signal,
                window_size=0,
                stride=25,
            )

    def test_task_invalid_stride_raises(self):
        with self.assertRaises(ValueError):
            DailyAndSportActivitiesClassification(
                signal_loader=self.dataset.load_signal,
                window_size=50,
                stride=0,
            )

    def test_task_window_larger_than_signal_raises(self):
        # window_size=200 exceeds the 125-row segments.
        with self.assertRaises(ValueError):
            self.dataset.set_task(
                DailyAndSportActivitiesClassification(
                    signal_loader=self.dataset.load_signal,
                    window_size=200,
                    stride=25,
                )
            )


if __name__ == "__main__":
    unittest.main()