From 6d235702d248eb0fafedf2f4ff6f9861dee7c170 Mon Sep 17 00:00:00 2001 From: Niam Pattni Date: Mon, 20 Apr 2026 18:07:54 -0400 Subject: [PATCH 1/5] Daily and Sport Activities Dataset + Task Implementation --- docs/api/datasets.rst | 1 + ...ealth.datasets.DailyAndSportActivities.rst | 9 + docs/api/tasks.rst | 1 + ...pyhealth.tasks.DailyAndSportActivities.rst | 7 + examples/daily_sport_activities.ipynb | 388 ++++++++++++++++++ pyhealth/datasets/__init__.py | 1 + .../configs/daily_sport_activities.yaml | 18 + pyhealth/datasets/daily_sport_activities.py | 249 +++++++++++ pyhealth/tasks/__init__.py | 1 + pyhealth/tasks/daily_sport_activities.py | 122 ++++++ tests/core/test_daily_sport_activities.py | 228 ++++++++++ 11 files changed, 1025 insertions(+) create mode 100644 docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst create mode 100644 docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst create mode 100644 examples/daily_sport_activities.ipynb create mode 100644 pyhealth/datasets/configs/daily_sport_activities.yaml create mode 100644 pyhealth/datasets/daily_sport_activities.py create mode 100644 pyhealth/tasks/daily_sport_activities.py create mode 100644 tests/core/test_daily_sport_activities.py diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst index 8d9a59d21..96cdab284 100644 --- a/docs/api/datasets.rst +++ b/docs/api/datasets.rst @@ -244,5 +244,6 @@ Available Datasets datasets/pyhealth.datasets.ClinVarDataset datasets/pyhealth.datasets.COSMICDataset datasets/pyhealth.datasets.TCGAPRADDataset + datasetes/pyhealth.datasets.DailyAndSportActivities datasets/pyhealth.datasets.splitter datasets/pyhealth.datasets.utils diff --git a/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst b/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst new file mode 100644 index 000000000..46b7ebf3c --- /dev/null +++ b/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst @@ -0,0 +1,9 @@ 
+pyhealth.datasets.daily_sport_activities +======================================== + +The Daily and Sport Activities dataset. For more information see `here `_. + +.. autoclass: pyhealth.datasets.daily_sport_activities.DailyAndSportActivitiesDataset + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/tasks.rst b/docs/api/tasks.rst index 23a4e06e5..115cf6667 100644 --- a/docs/api/tasks.rst +++ b/docs/api/tasks.rst @@ -230,3 +230,4 @@ Available Tasks Mutation Pathogenicity (COSMIC) Cancer Survival Prediction (TCGA) Cancer Mutation Burden (TCGA) + Daily and Sport Activities diff --git a/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst b/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst new file mode 100644 index 000000000..6db8a5bde --- /dev/null +++ b/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst @@ -0,0 +1,7 @@ +pyhealth.tasks.daily_sport_activities +=============================================== + +.. autoclass:: pyhealth.tasks.daily_sport_activities.DailyAndSportActivitiesTask + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/examples/daily_sport_activities.ipynb b/examples/daily_sport_activities.ipynb new file mode 100644 index 000000000..1b546d318 --- /dev/null +++ b/examples/daily_sport_activities.ipynb @@ -0,0 +1,388 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3d4f2a5e", + "metadata": {}, + "source": [ + "# DailySportActivities RNN Ablation Study\n", + "\n", + "This notebook demonstrates an ablation study for the Daily and Sports Activities dataset using a built-in PyHealth RNN model.\n", + "\n", + "## Goal\n", + "Evaluate how task configuration affects downstream multiclass activity recognition performance.\n", + "\n", + "## Ablation configurations\n", + "1. `window_size=25, stride=10, normalize=True`\n", + "2. `window_size=50, stride=25, normalize=True`\n", + "3. 
`window_size=50, stride=25, normalize=False`\n", + "\n", + "## Workflow\n", + "- Build a small synthetic dataset in the real folder format: `aXX/pY/sZZ.txt`\n", + "- Create PyHealth task datasets under multiple task settings\n", + "- Train/evaluate a built-in PyHealth RNN\n", + "- Compare test accuracy across configurations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceac2c74", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import tempfile\n", + "import random\n", + "\n", + "import numpy as np\n", + "\n", + "from pyhealth.datasets.daily_sport_activities import DailyAndSportActivitiesDataset\n", + "from pyhealth.tasks.daily_sport_activities import DailyAndSportActivitiesTask\n", + "from pyhealth.datasets import split_by_patient, get_dataloader\n", + "from pyhealth.models import RNN\n", + "from pyhealth.trainer import Trainer\n", + "\n", + "random.seed(42)\n", + "np.random.seed(42)" + ] + }, + { + "cell_type": "markdown", + "id": "1c74bc91", + "metadata": {}, + "source": [ + "## Synthetic data generation\n", + "\n", + "We create synthetic segment files in the same folder structure as the real dataset:\n", + "\n", + "- `a01`, `a02`, `a03` for activities\n", + "- `p1`, `p2`, ... for subjects\n", + "- `s01.txt`, `s02.txt`, ... for segments\n", + "\n", + "Each file is a `125 x 45` matrix:\n", + "- 125 rows = 5 seconds × 25 Hz\n", + "- 45 columns = 5 body units × 9 sensor axes\n", + "\n", + "We inject class-specific structure so the model has a signal to learn." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ceae0a10", + "metadata": {}, + "outputs": [], + "source": [ + "def _write_fake_signal_file(\n", + " file_path: Path,\n", + " activity_idx: int,\n", + " shape=(125, 45),\n", + " seed: int = 0,\n", + ") -> None:\n", + " rng = np.random.default_rng(seed)\n", + " signal = rng.normal(loc=0.0, scale=1.0, size=shape).astype(np.float32)\n", + "\n", + " # Inject class-dependent structure\n", + " start_col = activity_idx * 3\n", + " end_col = min(start_col + 3, shape[1])\n", + " signal[:, start_col:end_col] += 2.5\n", + "\n", + " # Add a mild temporal trend by class\n", + " t = np.linspace(0, 1, shape[0], dtype=np.float32)[:, None]\n", + " signal[:, start_col:end_col] += (activity_idx + 1) * 0.5 * t\n", + "\n", + " file_path.parent.mkdir(parents=True, exist_ok=True)\n", + " np.savetxt(file_path, signal, delimiter=\",\", fmt=\"%.6f\")\n", + "\n", + "\n", + "def build_synthetic_dataset(root: Path) -> None:\n", + " activities = [\"a01\", \"a02\", \"a03\"]\n", + " subjects = [f\"p{i}\" for i in range(1, 7)]\n", + " segments_per_subject = 4\n", + "\n", + " seed = 0\n", + " for activity_idx, activity in enumerate(activities):\n", + " for subject in subjects:\n", + " for seg in range(1, segments_per_subject + 1):\n", + " file_path = root / activity / subject / f\"s{seg:02d}.txt\"\n", + " _write_fake_signal_file(\n", + " file_path=file_path,\n", + " activity_idx=activity_idx,\n", + " seed=seed,\n", + " )\n", + " seed += 1" + ] + }, + { + "cell_type": "markdown", + "id": "6df68140", + "metadata": {}, + "source": [ + "## Build the synthetic dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3b023a0", + "metadata": {}, + "outputs": [], + "source": [ + "tmpdir = tempfile.TemporaryDirectory()\n", + "root = Path(tmpdir.name) / \"daily_sport_activities\"\n", + "build_synthetic_dataset(root)\n", + "\n", + "root" + ] + }, + { + "cell_type": "markdown", + "id": "2cbeacf8", + "metadata": {}, 
+ "source": [ + "## Quick dataset sanity check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "947fb475", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = DailyAndSportActivitiesDataset(root=str(root))\n", + "raw_samples = dataset.parse_data()\n", + "\n", + "len(raw_samples), raw_samples[0][\"signal\"].shape, raw_samples[0][\"activity\"]" + ] + }, + { + "cell_type": "markdown", + "id": "08e695fe", + "metadata": {}, + "source": [ + "## Define one ablation run\n", + "\n", + "For each configuration:\n", + "- build a task-specific sample dataset\n", + "- split by patient\n", + "- create dataloaders\n", + "- train a built-in PyHealth RNN\n", + "- evaluate on the test set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "227158ea", + "metadata": {}, + "outputs": [], + "source": [ + "def run_one_config(root: Path, cfg: dict) -> dict:\n", + " dataset = DailyAndSportActivitiesDataset(root=str(root))\n", + " task = DailyAndSportActivitiesTask(**cfg, signal_loader=dataset.load_signal)\n", + "\n", + " sample_dataset = dataset.set_task(task)\n", + "\n", + " train_ds, val_ds, test_ds = split_by_patient(sample_dataset, [0.6, 0.2, 0.2])\n", + "\n", + " train_loader = get_dataloader(train_ds, batch_size=16, shuffle=True)\n", + " val_loader = get_dataloader(val_ds, batch_size=16, shuffle=False)\n", + " test_loader = get_dataloader(test_ds, batch_size=16, shuffle=False)\n", + "\n", + " model = RNN(\n", + " dataset=sample_dataset,\n", + " embedding_dim=128,\n", + " hidden_dim=64,\n", + " rnn_type=\"GRU\",\n", + " num_layers=1,\n", + " dropout=0.1,\n", + " )\n", + "\n", + " trainer = Trainer(\n", + " model=model,\n", + " metrics=[\"accuracy\"],\n", + " device=\"cpu\",\n", + " enable_logging=False,\n", + " )\n", + "\n", + " trainer.train(\n", + " train_dataloader=train_loader,\n", + " val_dataloader=val_loader,\n", + " epochs=5,\n", + " monitor=\"accuracy\",\n", + " monitor_criterion=\"max\",\n", + " )\n", + "\n", + " scores = 
trainer.evaluate(test_loader)\n", + "\n", + " return {\n", + " \"config\": cfg,\n", + " \"n_total_samples\": len(sample_dataset),\n", + " \"n_train\": len(train_ds),\n", + " \"n_val\": len(val_ds),\n", + " \"n_test\": len(test_ds),\n", + " \"accuracy\": float(scores[\"accuracy\"]),\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "aa33d665", + "metadata": {}, + "source": [ + "## Define ablation settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fca5285", + "metadata": {}, + "outputs": [], + "source": [ + "configs = [\n", + " {\"window_size\": 25, \"stride\": 10, \"normalize\": True},\n", + " {\"window_size\": 50, \"stride\": 25, \"normalize\": True},\n", + " {\"window_size\": 50, \"stride\": 25, \"normalize\": False},\n", + "]\n", + "\n", + "configs" + ] + }, + { + "cell_type": "markdown", + "id": "8a3d3843", + "metadata": {}, + "source": [ + "## Run the ablation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "311188e5-b47e-40a9-bcba-71ea1677dcb6", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = DailyAndSportActivitiesDataset(root=str(root))\n", + "patient = dataset.get_patient(\"p1\")\n", + "event = patient.get_events(event_type=\"daily_sport_activities\")[0]\n", + "print(event)\n", + "print(event.attr_dict.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c5784e2", + "metadata": {}, + "outputs": [], + "source": [ + "results = []\n", + "\n", + "for cfg in configs:\n", + " print(f\"Running config: {cfg}\")\n", + " result = run_one_config(root, cfg)\n", + " results.append(result)\n", + " print(result)\n", + " print(\"-\" * 80)" + ] + }, + { + "cell_type": "markdown", + "id": "d36015e3", + "metadata": {}, + "source": [ + "## Summarize results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df67cb3d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "pd.set_option(\"display.max_colwidth\", None)\n", + 
"\n", + "results_df = pd.DataFrame(results)\n", + "results_df" + ] + }, + { + "cell_type": "markdown", + "id": "3d194c27", + "metadata": {}, + "source": [ + "## Sort by accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89fe3ef7", + "metadata": {}, + "outputs": [], + "source": [ + "results_df.sort_values(\"accuracy\", ascending=False).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b5a3c92", + "metadata": {}, + "outputs": [], + "source": [ + "best_row = results_df.sort_values(\"accuracy\", ascending=False).iloc[0]\n", + "\n", + "print(\"Best configuration:\")\n", + "print(best_row[\"config\"])\n", + "print(f\"Accuracy: {best_row['accuracy']:.4f}\")\n", + "print(f\"Total task samples: {best_row['n_total_samples']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a4b2cfc", + "metadata": {}, + "outputs": [], + "source": [ + "tmpdir.cleanup()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44dcdd98-17c9-457b-95b8-8e9b5c6e1c93", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py index 50b1b3887..7dd25a4c8 100644 --- a/pyhealth/datasets/__init__.py +++ b/pyhealth/datasets/__init__.py @@ -68,6 +68,7 @@ def __init__(self, *args, **kwargs): from .bmd_hs import BMDHSDataset from .support2 import Support2Dataset from .tcga_prad import TCGAPRADDataset +from .daily_sport_activities import DailyAndSportActivitiesDataset from .splitter import ( 
class DailyAndSportActivitiesDataset(BaseDataset):
    """PyHealth dataset for the UCI Daily and Sports Activities data.

    Parses multi-sensor time-series text files into structured event rows
    suitable for downstream task processing.

    Expected folder layout:
        root/
            a01/
                p1/
                    s01.txt
                    s02.txt
            a02/
                p1/
                    s01.txt

    Each ``.txt`` file is expected to contain a comma-separated 125 x 45
    numeric sensor matrix (5 seconds at 25 Hz, 5 body units x 9 axes).

    Parsed sample format (see :meth:`parse_data`):
        {
            "record_id": str,
            "patient_id": str,
            "activity": str,
            "activity_id": str,          # folder name, e.g. "a01"
            "segment_id": str,
            "signal": np.ndarray,        # shape: [time_steps, num_features]
        }
    """

    # Human-readable names for activity folders a01..a19; index = id - 1.
    activities: List[str] = ["sitting", "standing", "lying on back", "lying on right side",
                             "ascending stairs", "descending stairs", "standing still in elevator",
                             "moving around in elevator", "walking in parking lot",
                             "walking on 4km/h treadmill 0 incline", "walking on 4km/h treadmill 15 incline",
                             "running on 8km/h treadmill", "using stair stepper", "using cross trainer",
                             "cycling in horizontal position", "cycling in vertical position",
                             "rowing", "jumping", "playing basketball"]

    def __init__(
        self,
        root: str = ".",
        config_path: Optional[str] = str(Path(__file__).parent / "configs" / "daily_sport_activities.yaml"),
        download: bool = False,
        dev: bool = False,
    ):
        """Initialize the dataset.

        Args:
            root: Root directory containing the ``aXX/pY/sZZ.txt`` tree.
            config_path: Path to the PyHealth table config YAML.
            download: If True, download and extract the raw data into root.
            dev: Passed through to BaseDataset (dev/sample mode).

        Raises:
            FileNotFoundError: If root does not exist (after optional download).
            NotADirectoryError: If root exists but is not a directory.
        """
        self.root_path = Path(root)
        self.root = root

        if download:
            self._download(self.root)

        if not self.root_path.exists():
            raise FileNotFoundError(f"Dataset root does not exist: {self.root_path}")

        if not self.root_path.is_dir():
            raise NotADirectoryError(f"Dataset root is not a directory: {self.root_path}")

        super().__init__(
            root=self.root,
            tables=["daily_sport_activities"],
            dataset_name="daily_sport_activities",
            config_path=config_path,
            dev=dev,
        )

    @property
    def default_task(self):
        """Default task: activity classification with this dataset's loader."""
        # Imported lazily to avoid a circular import between datasets and tasks.
        from pyhealth.tasks.daily_sport_activities import DailyAndSportActivitiesTask
        return DailyAndSportActivitiesTask(signal_loader=self.load_signal)

    def _download(self, root: str) -> None:
        """Downloads the Daily and Sports Activities dataset and extracts the compressed data.

        Args:
            root (str): Root directory of raw data.
        Raises:
            HTTPError: If the file cannot be downloaded.
        """

        dataset_url = "https://archive.ics.uci.edu/static/public/256/daily+and+sports+activities.zip"

        root_path = Path(root)
        root_path.mkdir(parents=True, exist_ok=True)

        response = requests.get(dataset_url, timeout=60)
        response.raise_for_status()

        # The archive holds the whole activity/subject/segment tree.
        with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
            zf.extractall(root_path)

    def _discover_files(self) -> List[Path]:
        """Find all text files under the dataset root (sorted for determinism).

        Raises:
            FileNotFoundError: If no ``.txt`` files exist under the root.
        """
        txt_files = sorted(self.root_path.rglob("*.txt"))

        if not txt_files:
            # Fixed: the original interpolated `root_path.root`, but
            # `root_path` is undefined in this scope (only self.root_path
            # exists), so the intended error raised a NameError instead.
            raise FileNotFoundError(
                f"No .txt files found under dataset root: {self.root_path}"
            )

        return txt_files

    def _infer_metadata_from_path(self, file_path: Path) -> Dict[str, str]:
        """Infer activity, subject, and segment identifiers from the file path.

        Expects the path to end with ``aXX/pY/sZZ.txt`` relative to root.

        Raises:
            ValueError: If the path depth or any component prefix is wrong.
        """
        relative_parts = file_path.relative_to(self.root_path).parts

        if len(relative_parts) < 3:
            raise ValueError(
                f"Unexpected file structure for {file_path}. "
                f"Expected at least activity/subject/file.txt"
            )

        activity_id = relative_parts[-3]
        patient_id = relative_parts[-2]
        segment_id = file_path.stem
        record_id = f"{patient_id}_{activity_id}_{segment_id}"

        if not activity_id.startswith("a"):
            raise ValueError(f"Invalid activity folder name: {activity_id}")
        if not patient_id.startswith("p"):
            raise ValueError(f"Invalid subject folder name: {patient_id}")
        if not segment_id.startswith("s"):
            raise ValueError(f"Invalid segment filename: {segment_id}")

        return {
            "record_id": record_id,
            "activity_id": activity_id,
            "patient_id": patient_id,
            "segment_id": segment_id,
        }

    def load_signal(self, file_path: str | Path) -> np.ndarray:
        """Load and validate a 125 x 45 sensor matrix from a text file.

        Args:
            file_path: Path to a comma-separated segment file.

        Returns:
            np.ndarray of float32 with shape (125, 45).

        Raises:
            ValueError: If the file is unparsable, empty, mis-shaped, or
                contains non-finite values.
        """
        file_path = Path(file_path)

        try:
            signal = np.loadtxt(file_path, delimiter=",", dtype=np.float32)
        except Exception as e:
            raise ValueError(f"Failed to parse numeric data from {file_path}: {e}") from e

        if signal.size == 0:
            raise ValueError(f"Empty signal file: {file_path}")

        # A single row parses as 1-D; promote to a column so the shape
        # check below can give a precise error.
        if signal.ndim == 1:
            signal = np.expand_dims(signal, axis=1)

        if signal.ndim != 2:
            raise ValueError(
                f"Signal in {file_path} must be 2D after parsing, got shape {signal.shape}"
            )

        if signal.shape != (125, 45):
            raise ValueError(
                f"Signal in {file_path} must have shape (125, 45), got {signal.shape}"
            )

        if not np.isfinite(signal).all():
            raise ValueError(f"Signal contains NaN or Inf values: {file_path}")

        return signal

    def _get_activity_name(self, activity_id: str) -> str:
        """Map an activity folder name (e.g. "a01") to its human-readable label."""
        idx = int(activity_id[1:]) - 1
        if idx < 0 or idx >= len(self.activities):
            raise ValueError(f"Invalid activity_id: {activity_id}")
        return self.activities[idx]

    def _parse_file_to_event_row(self, file_path: Path) -> Dict[str, Any]:
        """Build one PyHealth event row (flat dict) from a segment file."""
        metadata = self._infer_metadata_from_path(file_path)
        signal = self.load_signal(file_path)

        activity_name = self._get_activity_name(metadata["activity_id"])

        return {
            "patient_id": metadata["patient_id"],
            "event_type": "daily_sport_activities",
            # Segments carry no wall-clock time in this dataset.
            "timestamp": pd.NaT,
            "daily_sport_activities/record_id": metadata["record_id"],
            "daily_sport_activities/visit_id": metadata["segment_id"],
            "daily_sport_activities/activity_id": metadata["activity_id"],
            "daily_sport_activities/activity": activity_name,
            "daily_sport_activities/segment_id": metadata["segment_id"],
            "daily_sport_activities/file_path": str(file_path),
            "daily_sport_activities/n_rows": int(signal.shape[0]),
            "daily_sport_activities/n_cols": int(signal.shape[1]),
            "daily_sport_activities/sampling_rate_hz": 25,
            "daily_sport_activities/duration_seconds": 5,
        }

    def load_data(self) -> dd.DataFrame:
        """Load raw segment files into a PyHealth-compatible event dataframe."""
        rows: List[Dict[str, Any]] = []
        txt_files = self._discover_files()

        for file_path in txt_files:
            rows.append(self._parse_file_to_event_row(file_path))

        if not rows:
            raise ValueError("No samples were parsed from the dataset.")

        pdf = pd.DataFrame(rows)
        pdf["patient_id"] = pdf["patient_id"].astype("string")
        pdf["event_type"] = pdf["event_type"].astype("string")
        pdf["timestamp"] = pd.NaT

        return dd.from_pandas(pdf, npartitions=1)

    def parse_data(self) -> List[Dict]:
        """Debug helper: parse raw files into in-memory samples (signals included)."""
        samples: List[Dict[str, Any]] = []
        txt_files = self._discover_files()

        for file_path in txt_files:
            metadata = self._infer_metadata_from_path(file_path)
            signal = self.load_signal(file_path)
            activity_name = self._get_activity_name(metadata["activity_id"])

            samples.append(
                {
                    "record_id": metadata["record_id"],
                    "patient_id": metadata["patient_id"],
                    "visit_id": metadata["segment_id"],
                    "activity_id": metadata["activity_id"],
                    "activity": activity_name,
                    "segment_id": metadata["segment_id"],
                    "file_path": str(file_path),
                    "signal": signal,
                }
            )

        if not samples:
            raise ValueError("No samples were parsed from the dataset.")

        return samples
/ std + +def _validate_and_select_features( + signal: np.ndarray, selected_features: Optional[List[int]] +) -> np.ndarray: + """Validate optional feature indices and subset the signal.""" + if selected_features is None: + return signal + + if len(selected_features) == 0: + raise ValueError("selected_features cannot be an empty list.") + + n_features = signal.shape[1] + for idx in selected_features: + if idx < 0 or idx >= n_features: + raise ValueError( + f"Feature index {idx} is out of bounds for signal with {n_features} features." + ) + + return signal[:, selected_features] + +def _sliding_windows(signal: np.ndarray, window_size: int, stride: int) -> List[np.ndarray]: + """Split a sequence into fixed-size sliding windows.""" + if window_size <= 0: + raise ValueError(f"window_size must be positive, got {window_size}") + + if stride <= 0: + raise ValueError(f"stride must be positive, got {stride}") + + n_steps = signal.shape[0] + windows: List[np.ndarray] = [] + + if n_steps < window_size: + return windows + + for start in range(0, n_steps - window_size + 1, stride): + end = start + window_size + windows.append(signal[start:end]) + + return windows + +class DailyAndSportActivitiesTask(BaseTask): + """Create task samples for activity recognition from DailyAndSportActivitiesDataset.""" + task_name: str = "DailyAndSportActivitiesTask" + input_schema: Dict[str, str] = {"signal": "tensor"} + output_schema: Dict[str, str] = {"label": "multiclass"} + + def __init__( + self, + window_size: int = 50, + stride: int = 25, + normalize: bool = True, + selected_features: Optional[List[int]] = None, + signal_loader=None, + ) -> None: + if window_size <= 0: + raise ValueError(f"window_size must be positive, got {window_size}") + if stride <= 0: + raise ValueError(f"stride must be positive, got {stride}") + + self.window_size = window_size + self.stride = stride + self.normalize = normalize + self.selected_features = selected_features + self.signal_loader = signal_loader + + def 
__call__(self, patient: Patient) -> List[Dict]: + """Generate activity-recognition samples for one patient.""" + events: List[Event] = patient.get_events(event_type="daily_sport_activities") + + samples: List[Dict] = [] + + for event in events: + signal = self.signal_loader(event["file_path"]) + signal = _validate_and_select_features(signal, self.selected_features) + + if self.normalize: + signal = _normalize_signal(signal) + + windows = _sliding_windows( + signal=signal, + window_size=self.window_size, + stride=self.stride, + ) + + activity_id = event["activity_id"] + if isinstance(activity_id, str) and activity_id.startswith("a"): + label = int(activity_id[1:]) - 1 + else: + label = int(activity_id) + + record_id = event["record_id"] + visit_id = event["visit_id"] + + for idx, window in enumerate(windows): + samples.append( + { + "patient_id": patient.patient_id, + "visit_id": visit_id, + "record_id": f"{record_id}_win_{idx}", + "signal": window.astype(np.float32), + "label": label, + } + ) + + return samples + \ No newline at end of file diff --git a/tests/core/test_daily_sport_activities.py b/tests/core/test_daily_sport_activities.py new file mode 100644 index 000000000..0924be338 --- /dev/null +++ b/tests/core/test_daily_sport_activities.py @@ -0,0 +1,228 @@ +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from pyhealth.datasets.daily_sport_activities import DailyAndSportActivitiesDataset +from pyhealth.tasks.daily_sport_activities import ( + DailyAndSportActivitiesTask, +) + + +def _write_fake_signal_file(file_path: Path, shape=(125, 45), seed: int = 0): + rng = np.random.default_rng(seed) + data = rng.normal(size=shape).astype(np.float32) + + file_path.parent.mkdir(parents=True, exist_ok=True) + pd.DataFrame(data).to_csv( + file_path, + header=False, + index=False, + ) + +def _make_fake_dataset(root: Path): + """ + Creates a tiny synthetic dataset with the same folder structure as the real one. 
+ + Structure: + root/ + a01/ + p1/ + s01.txt + s02.txt + p7/ + s01.txt + p8/ + s01.txt + a02/ + p1/ + s01.txt + p7/ + s01.txt + p8/ + s01.txt + """ + files = [ + ("a01", "p1", "s01.txt"), + ("a01", "p1", "s02.txt"), + ("a01", "p7", "s01.txt"), + ("a01", "p8", "s01.txt"), + ("a02", "p1", "s01.txt"), + ("a02", "p7", "s01.txt"), + ("a02", "p8", "s01.txt"), + ] + + for i, (activity, subject, segment) in enumerate(files): + path = root / activity / subject / segment + _write_fake_signal_file(path, shape=(125, 45), seed=i) + + +def test_parse_data_loads_all_samples(tmp_path): + data_root = tmp_path / "daily_sport_activities" + _make_fake_dataset(data_root) + + dataset = DailyAndSportActivitiesDataset(root=str(data_root)) + samples = dataset.parse_data() + + assert len(samples) == 7 + assert isinstance(samples, list) + + first = samples[0] + assert "record_id" in first + assert "patient_id" in first + assert "visit_id" in first + assert "activity_id" in first + assert "activity" in first + assert "segment_id" in first + assert "file_path" in first + assert "signal" in first + + +def test_signal_shape_is_correct(tmp_path): + data_root = tmp_path / "daily_sport_activities" + _make_fake_dataset(data_root) + + dataset = DailyAndSportActivitiesDataset(root=str(data_root)) + samples = dataset.parse_data() + + for sample in samples: + assert sample["signal"].shape == (125, 45) + assert sample["signal"].dtype == np.float32 + + +def test_invalid_shape_raises_error(tmp_path): + bad_file = tmp_path / "daily_sport_activities" / "a01" / "p1" / "s01.txt" + _write_fake_signal_file(bad_file, shape=(124, 45), seed=123) + + dataset = DailyAndSportActivitiesDataset(root=str(tmp_path / "daily_sport_activities")) + + with pytest.raises(ValueError, match="must have shape"): + dataset.parse_data() + + +def test_missing_root_raises_file_not_found(): + with pytest.raises(FileNotFoundError): + DailyAndSportActivitiesDataset(root="this/path/does/not/exist") + + +def 
test_load_data_returns_event_dataframe(tmp_path): + data_root = tmp_path / "daily_sport_activities" + _make_fake_dataset(data_root) + + dataset = DailyAndSportActivitiesDataset(root=str(data_root)) + df = dataset.load_data().compute() + + assert len(df) == 7 + assert "patient_id" in df.columns + assert "event_type" in df.columns + assert "timestamp" in df.columns + assert "daily_sport_activities/file_path" in df.columns + assert "daily_sport_activities/activity_id" in df.columns + assert "daily_sport_activities/activity" in df.columns + + assert set(df["event_type"].unique()) == {"daily_sport_activities"} + + +def test_get_patient_returns_expected_events(tmp_path): + data_root = tmp_path / "daily_sport_activities" + _make_fake_dataset(data_root) + + dataset = DailyAndSportActivitiesDataset(root=str(data_root)) + patient = dataset.get_patient("p1") + events = patient.get_events(event_type="daily_sport_activities") + + assert len(events) == 3 + + event = events[0] + assert "daily_sport_activities/file_path" in event + assert "daily_sport_activities/activity_id" in event + assert "daily_sport_activities/activity" in event + assert "daily_sport_activities/visit_id" in event + + +def test_event_metadata_is_parsed_correctly(tmp_path): + data_root = tmp_path / "daily_sport_activities" + _make_fake_dataset(data_root) + + dataset = DailyAndSportActivitiesDataset(root=str(data_root)) + df = dataset.load_data().compute() + + row = df.iloc[0] + + assert row["patient_id"] in {"p1", "p7", "p8"} + assert row["daily_sport_activities/activity_id"] in {"a01", "a02"} + assert row["daily_sport_activities/activity"] in {"sitting", "standing"} + assert row["daily_sport_activities/visit_id"] in {"s01", "s02"} + assert row["daily_sport_activities/n_rows"] == 125 + assert row["daily_sport_activities/n_cols"] == 45 + + +def test_set_task_generates_samples(tmp_path): + data_root = tmp_path / "daily_sport_activities" + _make_fake_dataset(data_root) + + dataset = 
DailyAndSportActivitiesDataset(root=str(data_root)) + task = DailyAndSportActivitiesTask( + window_size=50, + stride=25, + normalize=True, + ) + samples = dataset.set_task(task) + + assert len(samples) > 0 + + sample = samples[0] + assert "patient_id" in sample + assert "visit_id" in sample + assert "record_id" in sample + assert "signal" in sample + assert "label" in sample + + assert sample["signal"].shape == (50, 45) + assert isinstance(sample["label"], (int, np.integer)) + + +def test_task_selected_features_reduces_dimension(tmp_path): + data_root = tmp_path / "daily_sport_activities" + _make_fake_dataset(data_root) + + dataset = DailyAndSportActivitiesDataset(root=str(data_root)) + task = DailyAndSportActivitiesTask( + window_size=50, + stride=25, + selected_features=[0, 1, 2, 3], + ) + samples = dataset.set_task(task) + + assert len(samples) > 0 + assert samples[0]["signal"].shape == (50, 4) + + +def test_task_invalid_feature_index_raises(tmp_path): + data_root = tmp_path / "daily_sport_activities" + _make_fake_dataset(data_root) + + dataset = DailyAndSportActivitiesDataset(root=str(data_root)) + task = DailyAndSportActivitiesTask( + window_size=50, + stride=25, + selected_features=[999], + ) + + with pytest.raises(ValueError, match="out of bounds"): + dataset.set_task(task) + + +def test_task_window_too_large_raises(tmp_path): + data_root = tmp_path / "daily_sport_activities" + _make_fake_dataset(data_root) + + dataset = DailyAndSportActivitiesDataset(root=str(data_root)) + task = DailyAndSportActivitiesTask( + window_size=200, + stride=25, + ) + + with pytest.raises(ValueError): + dataset.set_task(task) From a8092c7e251b41cbf0bc39ec0a1850601640cbc0 Mon Sep 17 00:00:00 2001 From: Niam Pattni Date: Mon, 20 Apr 2026 21:57:38 -0400 Subject: [PATCH 2/5] Added comments and linted --- ...pyhealth.tasks.DailyAndSportActivities.rst | 2 +- examples/daily_sport_activities.ipynb | 16 +- .../configs/daily_sport_activities.yaml | 4 +- 
pyhealth/datasets/daily_sport_activities.py | 219 +++++++++++++++--- pyhealth/tasks/__init__.py | 2 +- pyhealth/tasks/daily_sport_activities.py | 127 +++++++++- tests/core/test_daily_sport_activities.py | 39 +++- 7 files changed, 332 insertions(+), 77 deletions(-) diff --git a/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst b/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst index 6db8a5bde..20fe75039 100644 --- a/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst +++ b/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst @@ -1,7 +1,7 @@ pyhealth.tasks.daily_sport_activities =============================================== -.. autoclass:: pyhealth.tasks.daily_sport_activities.DailyAndSportActivitiesTask +.. autoclass:: pyhealth.tasks.daily_sport_activities.DailyAndSportActivitiesClassification :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/examples/daily_sport_activities.ipynb b/examples/daily_sport_activities.ipynb index 1b546d318..edbd04e00 100644 --- a/examples/daily_sport_activities.ipynb +++ b/examples/daily_sport_activities.ipynb @@ -259,20 +259,6 @@ "## Run the ablation" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "311188e5-b47e-40a9-bcba-71ea1677dcb6", - "metadata": {}, - "outputs": [], - "source": [ - "dataset = DailyAndSportActivitiesDataset(root=str(root))\n", - "patient = dataset.get_patient(\"p1\")\n", - "event = patient.get_events(event_type=\"daily_sport_activities\")[0]\n", - "print(event)\n", - "print(event.attr_dict.keys())" - ] - }, { "cell_type": "code", "execution_count": null, @@ -366,7 +352,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pyhealth-project", "language": "python", "name": "python3" }, diff --git a/pyhealth/datasets/configs/daily_sport_activities.yaml b/pyhealth/datasets/configs/daily_sport_activities.yaml index e374552fc..66d0f2bd4 100644 --- 
a/pyhealth/datasets/configs/daily_sport_activities.yaml +++ b/pyhealth/datasets/configs/daily_sport_activities.yaml @@ -1,7 +1,5 @@ -# Author: - +# Author: Sezim Zamirbekova (szami2) and Niam Pattni (npattni2) version: "1.0" - tables: daily_sport_activities: file_path: "" diff --git a/pyhealth/datasets/daily_sport_activities.py b/pyhealth/datasets/daily_sport_activities.py index ef390c03a..271e51023 100644 --- a/pyhealth/datasets/daily_sport_activities.py +++ b/pyhealth/datasets/daily_sport_activities.py @@ -1,3 +1,23 @@ +""" +PyHealth dataset for the Daily and Sports Activity dataset. + +Dataset link: + https://archive.ics.uci.edu/dataset/256/daily+and+sports+activities + +Dataset paper: + Zhang, H.; Zhan, D.; Lin, Y.; He, J.; Zhu, Q.; Shen, Z.-J.; and + Zheng, Z. 2024. Daily Physical Activity Monitoring: Adaptive Learning + from Multi-source Motion Sensor Data. Proceedings of the fifth Conference + on Health, Inference, and Learning, volume 248 of Proceedings of Machine + Learning Research, 39–54. PMLR + +Dataset paper link: + https://raw.githubusercontent.com/mlresearch/v248/main/assets/zhang24a/zhang24a.pdf + +Authors: + Niam Pattni (npattni2@illinois.edu) + Sezim Zamirbekova (szami2@illinois.edu) +""" from __future__ import annotations from pathlib import Path @@ -15,50 +35,80 @@ class DailyAndSportActivitiesDataset(BaseDataset): - """PyHealth dataset for Daily and Sports Activities data. + """ + PyHealth dataset for Daily and Sports Activities data. This dataset parses multi-sensor time-series text files into structured samples suitable for downstream task processing. Expected folder layout example: root/ - activity_01/ - subject_01/ - segment_01.txt - segment_02.txt - activity_02/ - subject_01/ - segment_01.txt + a01/ + p01/ + s01.txt + s02.txt + a02/ + p01/ + s01.txt Each .txt file is expected to contain numeric sensor values arranged - row-wise over time. 
- - Parsed sample format: - { - "record_id": str, - "patient_id": str, - "activity": str, - "activity_id": int, - "segment_id": str, - "signal": np.ndarray, # shape: [time_steps, num_features] - } + row-wise over time. aXX represents an activity ID, pXX represents a subject ID, + and sXX represents a specific sensor ID. + + Parsed data format: + - "record_id": str + - "patient_id": str + - "visit_id": str + - "activity_id": str + - "activity": int + - "segment_id": str + - "file_path": str + - "signal": np.ndarray # shape: [time_steps, num_features] + + Attributes: + root (str): Root directory of the raw data. + root_path (Path): Root directory of the raw data. + config_path (str): Path to the configuration file. + activities (List[str]): Ordered list of activities by ID, activities[0] == a01. """ - activities: List[str] = ["sitting", "standing", "lying on back", "lying on right side", - "ascending stairs", "descending stairs", "standing still in elevator", + activities: List[str] = ["sitting", "standing", "lying on back", + "lying on right side", "ascending stairs", + "descending stairs", "standing still in elevator", "moving around in elevator", "walking in parking lot", - "walking on 4km/h treadmill 0 incline", "walking on 4km/h treadmill 15 incline", - "running on 8km/h treadmill", "using stair stepper", "using cross trainer", - "cycling in horizontal position", "cycling in vertical position", - "rowing", "jumping", "playing basketball"] + "walking on 4km/h treadmill 0 incline", + "walking on 4km/h treadmill 15 incline", + "running on 8km/h treadmill", "using stair stepper", + "using cross trainer", "cycling in horizontal position", + "cycling in vertical position", "rowing", "jumping", + "playing basketball"] def __init__( self, root: str = ".", - config_path: Optional[str] = str(Path(__file__).parent / "configs" / "daily_sport_activities.yaml"), + config_path: Optional[str] = str( + Path(__file__).parent / "configs" / "daily_sport_activities.yaml" + ), 
download: bool = False, dev: bool = False, ): + """ + Initializes the Daily and Sports Activities dataset. + + Args: + root (str): Root directory of the raw data. Defaults to the current + working directory. + config_path (Optional[str]): Path to the configuration file. + Defaults to "../configs/daily_sport_activities.yaml". + download (bool): Whether to download the dataset or use an existing copy. + Defaults to False. + dev (bool): Configures parent BaseDataset. Defaults to False. + + Raises: + FileNotFoundError: If the dataset cannot be found in the specified + directory. + NotADirectoryError: If the specified root path is not a directory. + """ self.root_path = Path(root) self.root = root @@ -69,7 +119,9 @@ def __init__( raise FileNotFoundError(f"Dataset root does not exist: {self.root_path}") if not self.root_path.is_dir(): - raise NotADirectoryError(f"Dataset root is not a directory: {self.root_path}") + raise NotADirectoryError( + f"Dataset root is not a directory: {self.root_path}" + ) super().__init__( root=self.root, @@ -81,11 +133,24 @@ def __init__( @property def default_task(self): - from pyhealth.tasks.daily_sport_activities import DailyAndSportActivitiesTask - return DailyAndSportActivitiesTask(signal_loader=self.load_signal) + """ + Returns the default task for this dataset. + + Returns: + DailyAndSportActivitiesClassification: The default classification task. + + Example:: + >>> dataset = DailyAndSportActivitiesDataset() + >>> task = dataset.default_task + """ + from pyhealth.tasks.daily_sport_activities import ( + DailyAndSportActivitiesClassification + ) + return DailyAndSportActivitiesClassification(signal_loader=self.load_signal) def _download(self, root: str) -> None: - """Downloads the Daily and Sports Activities dataset and extracts the compressed data. + """Downloads the Daily and Sports Activities dataset and extracts the + compressed data. Args: root (str): Root directory of raw data. 
@@ -105,18 +170,41 @@ def _download(self, root: str) -> None: zf.extractall(root_path) def _discover_files(self) -> List[Path]: - """Find all text files under the dataset root.""" + """ + Find all text files under the dataset root. + + Returns: + List[Path]: List of all paths to relevant text files + + Raises: + FileNotFoundError: No text files exist in the specified root path. + """ txt_files = sorted(self.root_path.rglob("*.txt")) if not txt_files: raise FileNotFoundError( - f"No .txt files found under dataset root: {root_path.root}" + f"No .txt files found under dataset root: {self.root_path.root}" ) return txt_files def _infer_metadata_from_path(self, file_path: Path) -> Dict[str, str]: - """Infer activity, subject, and segment identifiers from the file path.""" + """ + Infer activity, subject, and segment identifiers from the file path. + + Args: + file_path (Path): The path to the given text file. + + Returns: + Dict[str, str]: Map from metadata name to value for record, activity, + patient, and segment IDs + + Raises: + ValueError: Folder structure does not follow aXX/pXX/sXX.txt. + ValueError: Activity folder name doesn't start with a. + ValueError: Patient folder name doesn't start with p. + ValueError: Segment file name doesn't start with s. + """ relative_parts = file_path.relative_to(self.root_path).parts if len(relative_parts) < 3: @@ -145,13 +233,30 @@ def _infer_metadata_from_path(self, file_path: Path) -> Dict[str, str]: } def load_signal(self, file_path: str | Path) -> np.ndarray: - """Load and validate a 125 x 45 sensor matrix from a text file.""" + """ + Load and validate a 125 x 45 sensor matrix from a text file. + + Args: + file_path (str | Path): The path to the text file to load. + + Returns: + np.ndarray: The value loaded from the text file of the sensor. + + Raises: + ValueError: Couldn't read numeric data from the file. + ValueError: Empty target file. + ValueError: Shape of parsed data is not 2D. 
+ ValueError: Shape of parsed data is not (125, 45). + ValueError: Parsed data contains NaN or Inf values. + """ file_path = Path(file_path) try: signal = np.loadtxt(file_path, delimiter=",", dtype=np.float32) except Exception as e: - raise ValueError(f"Failed to parse numeric data from {file_path}: {e}") from e + raise ValueError( + f"Failed to parse numeric data from {file_path}: {e}" + ) from e if signal.size == 0: raise ValueError(f"Empty signal file: {file_path}") @@ -161,7 +266,8 @@ def load_signal(self, file_path: str | Path) -> np.ndarray: if signal.ndim != 2: raise ValueError( - f"Signal in {file_path} must be 2D after parsing, got shape {signal.shape}" + f"""Signal in {file_path} must be 2D after parsing, got shape + {signal.shape}""" ) if signal.shape != (125, 45): @@ -175,12 +281,33 @@ def load_signal(self, file_path: str | Path) -> np.ndarray: return signal def _get_activity_name(self, activity_id: str) -> str: + """ + Get activity name from a given ID. + + Args: + activity_id (str): The ID number XX from the folder "root/aXX/...". + + Returns: + str: The corresponding activity name from activities[XX - 1] + + Raises: + ValueError: Activity ID is not within 1 to 19 inclusive. + """ idx = int(activity_id[1:]) - 1 if idx < 0 or idx >= len(self.activities): raise ValueError(f"Invalid activity_id: {activity_id}") return self.activities[idx] def _parse_file_to_event_row(self, file_path: Path) -> Dict[str, Any]: + """ + Reads a target file and folder structure and parses it into an event row. + + Args: + file_path (Path): Path to target dataset file. 
+ + Returns: + Dict[str, Any]: Maps attribute name to value from parsed data + """ metadata = self._infer_metadata_from_path(file_path) signal = self.load_signal(file_path) @@ -203,7 +330,15 @@ def _parse_file_to_event_row(self, file_path: Path) -> Dict[str, Any]: } def load_data(self) -> dd.DataFrame: - """Load raw segment files into a PyHealth-compatible event dataframe.""" + """ + Load raw segment files into a PyHealth-compatible event dataframe. + + Returns: + dd.DataFrame: Dask dataframe of event rows from parsed data. + + Raises: + ValueError: No valid parsed data exists. + """ rows: List[Dict[str, Any]] = [] txt_files = self._discover_files() @@ -221,7 +356,15 @@ def load_data(self) -> dd.DataFrame: return dd.from_pandas(pdf, npartitions=1) def parse_data(self) -> List[Dict]: - """Debug helper: parse raw files into in-memory samples.""" + """ + Debug helper: parse raw files into in-memory samples. + + Returns: + List[Dict[str, Any]]: List of event rows from parsed data. + + Raises: + ValueError: No valid parsed data exists. + """ samples: List[Dict[str, Any]] = [] txt_files = self._discover_files() diff --git a/pyhealth/tasks/__init__.py b/pyhealth/tasks/__init__.py index 4a8889864..8392d9131 100644 --- a/pyhealth/tasks/__init__.py +++ b/pyhealth/tasks/__init__.py @@ -67,4 +67,4 @@ VariantClassificationClinVar, ) from .patient_linkage_mimic3 import PatientLinkageMIMIC3Task -from .daily_sport_activities import DailyAndSportActivitiesTask +from .daily_sport_activities import DailyAndSportActivitiesClassification diff --git a/pyhealth/tasks/daily_sport_activities.py b/pyhealth/tasks/daily_sport_activities.py index aae9b89a2..a58371124 100644 --- a/pyhealth/tasks/daily_sport_activities.py +++ b/pyhealth/tasks/daily_sport_activities.py @@ -1,6 +1,26 @@ +""" +PyHealth task for classification using the Daily and Sports Activity dataset. 
+ +Dataset link: + https://archive.ics.uci.edu/dataset/256/daily+and+sports+activities + +Dataset paper: + Zhang, H.; Zhan, D.; Lin, Y.; He, J.; Zhu, Q.; Shen, Z.-J.; and + Zheng, Z. 2024. Daily Physical Activity Monitoring: Adaptive Learning + from Multi-source Motion Sensor Data. Proceedings of the fifth Conference + on Health, Inference, and Learning, volume 248 of Proceedings of Machine + Learning Research, 39–54. PMLR + +Dataset paper link: + https://raw.githubusercontent.com/mlresearch/v248/main/assets/zhang24a/zhang24a.pdf + +Authors: + Niam Pattni (npattni2@illinois.edu) + Sezim Zamirbekova (szami2@illinois.edu) +""" from __future__ import annotations -from typing import Dict, List, Optional +from typing import Callable, Dict, List, Optional import numpy as np @@ -9,16 +29,40 @@ def _normalize_signal(signal: np.ndarray) -> np.ndarray: - """Apply per-feature z-score normalization.""" + """ + Apply per-feature z-score normalization. + + Args: + signal (np.ndarray): The signal tensor to normalize. + + Returns: + np.ndarray: The normalized signal. + """ mean = signal.mean(axis=0, keepdims=True) std = signal.std(axis=0, keepdims=True) std = np.where(std < 1e-8, 1.0, std) return (signal - mean) / std def _validate_and_select_features( - signal: np.ndarray, selected_features: Optional[List[int]] + signal: np.ndarray, + selected_features: Optional[List[int]], ) -> np.ndarray: - """Validate optional feature indices and subset the signal.""" + """ + Validate optional feature indices and subset the signal. + + Args: + signal (np.ndarray): The original signal before feature selection. + selected_features (Optional[List[int]]): Features to select from the + specified signal. + + Returns: + np.ndarray: The signal after feature selection. + + Raises: + ValueError: selected_features is empty. + ValueError: There is an index in selected_features which is out of bounds + for the signal. 
+ """ if selected_features is None: return signal @@ -29,13 +73,32 @@ def _validate_and_select_features( for idx in selected_features: if idx < 0 or idx >= n_features: raise ValueError( - f"Feature index {idx} is out of bounds for signal with {n_features} features." + f"""Feature index {idx} is out of bounds for signal with {n_features} + features.""" ) return signal[:, selected_features] -def _sliding_windows(signal: np.ndarray, window_size: int, stride: int) -> List[np.ndarray]: - """Split a sequence into fixed-size sliding windows.""" +def _sliding_windows( + signal: np.ndarray, + window_size: int, + stride: int, +) -> List[np.ndarray]: + """ + Split a sequence into fixed-size sliding windows. + + Args: + signal (np.ndarray): The signal to split. + window_size (int): The size of the sliding window. + stride (int): How far to slide the window at each step. + + Returns: + List[np.ndarray]: Windows generated from the signal. + + Raises: + ValueError: Window size is negative. + ValueError: Stride is negative. + """ if window_size <= 0: raise ValueError(f"window_size must be positive, got {window_size}") @@ -54,9 +117,24 @@ def _sliding_windows(signal: np.ndarray, window_size: int, stride: int) -> List[ return windows -class DailyAndSportActivitiesTask(BaseTask): - """Create task samples for activity recognition from DailyAndSportActivitiesDataset.""" - task_name: str = "DailyAndSportActivitiesTask" +class DailyAndSportActivitiesClassification(BaseTask): + """ + A PyHealth task class for classification of activities in the Daily and + Sport Activities dataset. + + Attributes: + task_name (str): The name of the task. + input_schema (Dict[str, str]): The schema for the task input. + output_schema (Dict[str, str]): The schema for the task output. 
+ + Examples: + >>> from pyhealth.datasets import DailyAndSportActivitiesDataset + >>> from pyhealth.tasks import DailyAndSportActivitiesClassification + >>> dataset = DailyAndSportActivitiesDataset(download=True) + >>> task = DailyAndSportActivitiesClassification() + >>> samples = dataset.set_task(task) + """ + task_name: str = "DailyAndSportActivitiesClassification" input_schema: Dict[str, str] = {"signal": "tensor"} output_schema: Dict[str, str] = {"label": "multiclass"} @@ -66,8 +144,25 @@ def __init__( stride: int = 25, normalize: bool = True, selected_features: Optional[List[int]] = None, - signal_loader=None, + signal_loader: Callable = None, ) -> None: + """ + Initializes the DailyAndSportActivitiesClassification task. + + Args: + window_size (int): The size of the sliding window on the input signal. + Defaults to 50. + stride (int): The size of the sliding window move. Defauts to 25. + normalize (bool): Should the signal data be normalized. Defaults to True. + selected_features (Optional[List[int]]): Features to select from the signal. + Defaults to None (all features). + signal_loader (Callable): The function to use for parsing signal data. + Defaults to None. + + Raises: + ValueError: Window size is negative. + ValueError: Stride is negative. + """ if window_size <= 0: raise ValueError(f"window_size must be positive, got {window_size}") if stride <= 0: raise ValueError(f"stride must be positive, got {stride}") @@ -80,7 +175,15 @@ def __init__( self.signal_loader = signal_loader def __call__(self, patient: Patient) -> List[Dict]: - """Generate activity-recognition samples for one patient.""" + """ + Generate activity-recognition samples for a single patient. + + Args: + patient (Patient): The patient to generate samples for. + + Returns: + List[Dict]: The list of samples for the specified patient. 
+ """ events: List[Event] = patient.get_events(event_type="daily_sport_activities") samples: List[Dict] = [] diff --git a/tests/core/test_daily_sport_activities.py b/tests/core/test_daily_sport_activities.py index 0924be338..05fd4e537 100644 --- a/tests/core/test_daily_sport_activities.py +++ b/tests/core/test_daily_sport_activities.py @@ -1,3 +1,11 @@ +""" +Unit tests for the DailyAndSportActivitiesDataset and +DailyAndSportActivitiesClassification classes. + +Authors: + Niam Pattni (npattni2@illinois.edu) + Sezim Zamirbekova (szami2@illinois.edu) +""" from pathlib import Path import numpy as np @@ -6,11 +14,23 @@ from pyhealth.datasets.daily_sport_activities import DailyAndSportActivitiesDataset from pyhealth.tasks.daily_sport_activities import ( - DailyAndSportActivitiesTask, + DailyAndSportActivitiesClassification, ) -def _write_fake_signal_file(file_path: Path, shape=(125, 45), seed: int = 0): +def _write_fake_signal_file( + file_path: Path, + shape: tuple[int, int] = (125, 45), + seed: int = 0, +): + """ + Creates a random fake signal to use in test cases. + + Args: + file_path (Path): The desination path of the file. + shape (tuple[int, int]): The shape of the random signal. Defaults to (125, 45). + seed (int): Seed to reproduce random signal. Defaults to 0. + """ rng = np.random.default_rng(seed) data = rng.normal(size=shape).astype(np.float32) @@ -42,6 +62,9 @@ def _make_fake_dataset(root: Path): s01.txt p8/ s01.txt + + Args: + root (Path): The root directory to create the dummy folder structure. 
""" files = [ ("a01", "p1", "s01.txt"), @@ -95,7 +118,9 @@ def test_invalid_shape_raises_error(tmp_path): bad_file = tmp_path / "daily_sport_activities" / "a01" / "p1" / "s01.txt" _write_fake_signal_file(bad_file, shape=(124, 45), seed=123) - dataset = DailyAndSportActivitiesDataset(root=str(tmp_path / "daily_sport_activities")) + dataset = DailyAndSportActivitiesDataset( + root=str(tmp_path / "daily_sport_activities") + ) with pytest.raises(ValueError, match="must have shape"): dataset.parse_data() @@ -163,7 +188,7 @@ def test_set_task_generates_samples(tmp_path): _make_fake_dataset(data_root) dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - task = DailyAndSportActivitiesTask( + task = DailyAndSportActivitiesClassification( window_size=50, stride=25, normalize=True, @@ -188,7 +213,7 @@ def test_task_selected_features_reduces_dimension(tmp_path): _make_fake_dataset(data_root) dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - task = DailyAndSportActivitiesTask( + task = DailyAndSportActivitiesClassification( window_size=50, stride=25, selected_features=[0, 1, 2, 3], @@ -204,7 +229,7 @@ def test_task_invalid_feature_index_raises(tmp_path): _make_fake_dataset(data_root) dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - task = DailyAndSportActivitiesTask( + task = DailyAndSportActivitiesClassification( window_size=50, stride=25, selected_features=[999], @@ -219,7 +244,7 @@ def test_task_window_too_large_raises(tmp_path): _make_fake_dataset(data_root) dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - task = DailyAndSportActivitiesTask( + task = DailyAndSportActivitiesClassification( window_size=200, stride=25, ) From c6f2581692bb137835786455d994d407de63da14 Mon Sep 17 00:00:00 2001 From: Niam Pattni Date: Mon, 20 Apr 2026 22:01:36 -0400 Subject: [PATCH 3/5] Added interpretation to ablation study --- examples/daily_sport_activities.ipynb | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git 
a/examples/daily_sport_activities.ipynb b/examples/daily_sport_activities.ipynb index edbd04e00..ee7075d52 100644 --- a/examples/daily_sport_activities.ipynb +++ b/examples/daily_sport_activities.ipynb @@ -316,6 +316,19 @@ "results_df.sort_values(\"accuracy\", ascending=False).reset_index(drop=True)" ] }, + { + "cell_type": "markdown", + "id": "eb351598", + "metadata": {}, + "source": [ + "## Brief interpretation\n", + "\n", + "- Larger windows (50) outperform smaller windows (25), likely due to better temporal context.\n", + "- Normalization **decreased** performance in this setup, suggesting the model may already handle raw scale well or synthetic signal structure was distorted.\n", + "- Smaller windows increase sample count but may reduce per-sample information, hurting performance.\n", + "- Note: Results are based on synthetic data; trends illustrate pipeline behavior rather than real-world performance." + ] + }, { "cell_type": "code", "execution_count": null, From 51fd03821dc29655e9b3c2ef92c8af1c6bf7eed2 Mon Sep 17 00:00:00 2001 From: Niam Pattni Date: Mon, 20 Apr 2026 22:35:08 -0400 Subject: [PATCH 4/5] Fixed class naming in example --- examples/daily_sport_activities.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/daily_sport_activities.ipynb b/examples/daily_sport_activities.ipynb index ee7075d52..a45c760f3 100644 --- a/examples/daily_sport_activities.ipynb +++ b/examples/daily_sport_activities.ipynb @@ -5,7 +5,7 @@ "id": "3d4f2a5e", "metadata": {}, "source": [ - "# DailySportActivities RNN Ablation Study\n", + "# Daily And Sport Activities RNN Ablation Study\n", "\n", "This notebook demonstrates an ablation study for the Daily and Sports Activities dataset using a built-in PyHealth RNN model.\n", "\n", @@ -38,7 +38,7 @@ "import numpy as np\n", "\n", "from pyhealth.datasets.daily_sport_activities import DailyAndSportActivitiesDataset\n", - "from pyhealth.tasks.daily_sport_activities import 
DailyAndSportActivitiesTask\n", + "from pyhealth.tasks.daily_sport_activities import DailyAndSportActivitiesClassification\n", "from pyhealth.datasets import split_by_patient, get_dataloader\n", "from pyhealth.models import RNN\n", "from pyhealth.trainer import Trainer\n", @@ -181,7 +181,7 @@ "source": [ "def run_one_config(root: Path, cfg: dict) -> dict:\n", " dataset = DailyAndSportActivitiesDataset(root=str(root))\n", - " task = DailyAndSportActivitiesTask(**cfg, signal_loader=dataset.load_signal)\n", + " task = DailyAndSportActivitiesClassification(**cfg, signal_loader=dataset.load_signal)\n", "\n", " sample_dataset = dataset.set_task(task)\n", "\n", From 1659558ff6fab7c214a8dbbeaec5c572e838624a Mon Sep 17 00:00:00 2001 From: Niam Pattni Date: Tue, 21 Apr 2026 09:49:27 -0400 Subject: [PATCH 5/5] Use unittest for testing --- ...ealth.datasets.DailyAndSportActivities.rst | 4 +- ...pyhealth.tasks.DailyAndSportActivities.rst | 4 +- examples/daily_sport_activities.ipynb | 39 +- pyhealth/datasets/daily_sport_activities.py | 48 +- pyhealth/tasks/daily_sport_activities.py | 7 +- tests/core/test_daily_sport_activities.py | 489 +++++++++--------- 6 files changed, 265 insertions(+), 326 deletions(-) diff --git a/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst b/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst index 46b7ebf3c..b588d91ec 100644 --- a/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst +++ b/docs/api/datasets/pyhealth.datasets.DailyAndSportActivities.rst @@ -1,9 +1,9 @@ -pyhealth.datasets.daily_sport_activities +pyhealth.datasets.DailyAndSportActivitiesDataset ======================================== The Daily and Sport Activities dataset. For more information see `here `_. -.. autoclass: pyhealth.datasets.daily_sport_activities.DailyAndSportActivitiesDataset +.. 
autoclass: pyhealth.datasets.DailyAndSportActivitiesDataset :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst b/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst index 20fe75039..1461c30db 100644 --- a/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst +++ b/docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst @@ -1,7 +1,7 @@ -pyhealth.tasks.daily_sport_activities +pyhealth.tasks.DailyAndSportActivitiesClassification =============================================== -.. autoclass:: pyhealth.tasks.daily_sport_activities.DailyAndSportActivitiesClassification +.. autoclass:: pyhealth.tasks.DailyAndSportActivitiesClassification :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/examples/daily_sport_activities.ipynb b/examples/daily_sport_activities.ipynb index a45c760f3..c5cce666b 100644 --- a/examples/daily_sport_activities.ipynb +++ b/examples/daily_sport_activities.ipynb @@ -37,14 +37,14 @@ "\n", "import numpy as np\n", "\n", - "from pyhealth.datasets.daily_sport_activities import DailyAndSportActivitiesDataset\n", - "from pyhealth.tasks.daily_sport_activities import DailyAndSportActivitiesClassification\n", + "from pyhealth.datasets import DailyAndSportActivitiesDataset\n", + "from pyhealth.tasks import DailyAndSportActivitiesClassification\n", "from pyhealth.datasets import split_by_patient, get_dataloader\n", "from pyhealth.models import RNN\n", "from pyhealth.trainer import Trainer\n", "\n", - "random.seed(42)\n", - "np.random.seed(42)" + "random.seed(123)\n", + "np.random.seed(123)" ] }, { @@ -136,27 +136,6 @@ "root" ] }, - { - "cell_type": "markdown", - "id": "2cbeacf8", - "metadata": {}, - "source": [ - "## Quick dataset sanity check" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "947fb475", - "metadata": {}, - "outputs": [], - "source": [ - "dataset = 
DailyAndSportActivitiesDataset(root=str(root))\n", - "raw_samples = dataset.parse_data()\n", - "\n", - "len(raw_samples), raw_samples[0][\"signal\"].shape, raw_samples[0][\"activity\"]" - ] - }, { "cell_type": "markdown", "id": "08e695fe", @@ -181,7 +160,7 @@ "source": [ "def run_one_config(root: Path, cfg: dict) -> dict:\n", " dataset = DailyAndSportActivitiesDataset(root=str(root))\n", - " task = DailyAndSportActivitiesClassification(**cfg, signal_loader=dataset.load_signal)\n", + " task = DailyAndSportActivitiesClassification(signal_loader=dataset.load_signal, **cfg)\n", "\n", " sample_dataset = dataset.set_task(task)\n", "\n", @@ -353,14 +332,6 @@ "source": [ "tmpdir.cleanup()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44dcdd98-17c9-457b-95b8-8e9b5c6e1c93", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/pyhealth/datasets/daily_sport_activities.py b/pyhealth/datasets/daily_sport_activities.py index 271e51023..47fb53ff3 100644 --- a/pyhealth/datasets/daily_sport_activities.py +++ b/pyhealth/datasets/daily_sport_activities.py @@ -44,15 +44,15 @@ class DailyAndSportActivitiesDataset(BaseDataset): Expected folder layout example: root/ a01/ - p01/ + p1/ s01.txt s02.txt a02/ - p01/ + p1/ s01.txt Each .txt file is expected to contain numeric sensor values arranged - row-wise over time. aXX represents an activity ID, pXX represents a subject ID, + row-wise over time. aXX represents an activity ID, p* represents a subject ID, and sXX represents a specific sensor ID. 
Parsed data format: @@ -143,9 +143,7 @@ def default_task(self): >>> dataset = DailyAndSportActivitiesDataset() >>> task = dataset.default_task """ - from pyhealth.tasks.daily_sport_activities import ( - DailyAndSportActivitiesClassification - ) + from pyhealth.tasks import DailyAndSportActivitiesClassification return DailyAndSportActivitiesClassification(signal_loader=self.load_signal) def _download(self, root: str) -> None: @@ -200,7 +198,7 @@ def _infer_metadata_from_path(self, file_path: Path) -> Dict[str, str]: patient, and segment IDs Raises: - ValueError: Folder structure does not follow aXX/pXX/sXX.txt. + ValueError: Folder structure does not follow aXX/p*/sXX.txt. ValueError: Activity folder name doesn't start with a. ValueError: Patient folder name doesn't start with p. ValueError: Segment file name doesn't start with s. @@ -354,39 +352,3 @@ def load_data(self) -> dd.DataFrame: pdf["timestamp"] = pd.NaT return dd.from_pandas(pdf, npartitions=1) - - def parse_data(self) -> List[Dict]: - """ - Debug helper: parse raw files into in-memory samples. - - Returns: - List[Dict[str, Any]]: List of event rows from parsed data. - - Raises: - ValueError: No valid parsed data exists. 
- """ - samples: List[Dict[str, Any]] = [] - txt_files = self._discover_files() - - for file_path in txt_files: - metadata = self._infer_metadata_from_path(file_path) - signal = self.load_signal(file_path) - activity_name = self._get_activity_name(metadata["activity_id"]) - - samples.append( - { - "record_id": metadata["record_id"], - "patient_id": metadata["patient_id"], - "visit_id": metadata["segment_id"], - "activity_id": metadata["activity_id"], - "activity": activity_name, - "segment_id": metadata["segment_id"], - "file_path": str(file_path), - "signal": signal, - } - ) - - if not samples: - raise ValueError("No samples were parsed from the dataset.") - - return samples diff --git a/pyhealth/tasks/daily_sport_activities.py b/pyhealth/tasks/daily_sport_activities.py index a58371124..4cbe616ad 100644 --- a/pyhealth/tasks/daily_sport_activities.py +++ b/pyhealth/tasks/daily_sport_activities.py @@ -140,24 +140,23 @@ class DailyAndSportActivitiesClassification(BaseTask): def __init__( self, + signal_loader: Callable, window_size: int = 50, stride: int = 25, normalize: bool = True, selected_features: Optional[List[int]] = None, - signal_loader: Callable = None, ) -> None: """ Initializes the DailyAndSportActivitiesClassification task. Args: + signal_loader (Callable): The function to use for parsing signal data. window_size (int): The size of the sliding window on the input signal. Defaults to 50. stride (int): The size of the sliding window move. Defauts to 25. normalize (bool): Should the signal data be normalized. Defaults to True. selected_features (Optional[List[int]]): Features to select from the signal. Defaults to None (all features). - signal_loader (Callable): The function to use for parsing signal data. - Defaults to None. Raises: ValueError: Window size is negative. 
@@ -168,11 +167,11 @@ def __init__( if stride <= 0: raise ValueError(f"stride must be positive, got {stride}") + self.signal_loader = signal_loader self.window_size = window_size self.stride = stride self.normalize = normalize self.selected_features = selected_features - self.signal_loader = signal_loader def __call__(self, patient: Patient) -> List[Dict]: """ diff --git a/tests/core/test_daily_sport_activities.py b/tests/core/test_daily_sport_activities.py index 05fd4e537..7e7951c1b 100644 --- a/tests/core/test_daily_sport_activities.py +++ b/tests/core/test_daily_sport_activities.py @@ -7,247 +7,254 @@ Sezim Zamirbekova (szami2@illinois.edu) """ from pathlib import Path +import shutil +import unittest import numpy as np import pandas as pd -import pytest - -from pyhealth.datasets.daily_sport_activities import DailyAndSportActivitiesDataset -from pyhealth.tasks.daily_sport_activities import ( - DailyAndSportActivitiesClassification, -) - - -def _write_fake_signal_file( - file_path: Path, - shape: tuple[int, int] = (125, 45), - seed: int = 0, -): - """ - Creates a random fake signal to use in test cases. - - Args: - file_path (Path): The desination path of the file. - shape (tuple[int, int]): The shape of the random signal. Defaults to (125, 45). - seed (int): Seed to reproduce random signal. Defaults to 0. - """ - rng = np.random.default_rng(seed) - data = rng.normal(size=shape).astype(np.float32) - - file_path.parent.mkdir(parents=True, exist_ok=True) - pd.DataFrame(data).to_csv( - file_path, - header=False, - index=False, - ) - -def _make_fake_dataset(root: Path): - """ - Creates a tiny synthetic dataset with the same folder structure as the real one. - - Structure: - root/ - a01/ - p1/ - s01.txt - s02.txt - p7/ - s01.txt - p8/ - s01.txt - a02/ - p1/ - s01.txt - p7/ - s01.txt - p8/ - s01.txt - - Args: - root (Path): The root directory to create the dummy folder structure. 
- """ - files = [ - ("a01", "p1", "s01.txt"), - ("a01", "p1", "s02.txt"), - ("a01", "p7", "s01.txt"), - ("a01", "p8", "s01.txt"), - ("a02", "p1", "s01.txt"), - ("a02", "p7", "s01.txt"), - ("a02", "p8", "s01.txt"), - ] - - for i, (activity, subject, segment) in enumerate(files): - path = root / activity / subject / segment - _write_fake_signal_file(path, shape=(125, 45), seed=i) - - -def test_parse_data_loads_all_samples(tmp_path): - data_root = tmp_path / "daily_sport_activities" - _make_fake_dataset(data_root) - - dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - samples = dataset.parse_data() - - assert len(samples) == 7 - assert isinstance(samples, list) - - first = samples[0] - assert "record_id" in first - assert "patient_id" in first - assert "visit_id" in first - assert "activity_id" in first - assert "activity" in first - assert "segment_id" in first - assert "file_path" in first - assert "signal" in first - - -def test_signal_shape_is_correct(tmp_path): - data_root = tmp_path / "daily_sport_activities" - _make_fake_dataset(data_root) - - dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - samples = dataset.parse_data() - - for sample in samples: - assert sample["signal"].shape == (125, 45) - assert sample["signal"].dtype == np.float32 - - -def test_invalid_shape_raises_error(tmp_path): - bad_file = tmp_path / "daily_sport_activities" / "a01" / "p1" / "s01.txt" - _write_fake_signal_file(bad_file, shape=(124, 45), seed=123) - - dataset = DailyAndSportActivitiesDataset( - root=str(tmp_path / "daily_sport_activities") - ) - - with pytest.raises(ValueError, match="must have shape"): - dataset.parse_data() - - -def test_missing_root_raises_file_not_found(): - with pytest.raises(FileNotFoundError): - DailyAndSportActivitiesDataset(root="this/path/does/not/exist") - - -def test_load_data_returns_event_dataframe(tmp_path): - data_root = tmp_path / "daily_sport_activities" - _make_fake_dataset(data_root) - - dataset = 
DailyAndSportActivitiesDataset(root=str(data_root)) - df = dataset.load_data().compute() - - assert len(df) == 7 - assert "patient_id" in df.columns - assert "event_type" in df.columns - assert "timestamp" in df.columns - assert "daily_sport_activities/file_path" in df.columns - assert "daily_sport_activities/activity_id" in df.columns - assert "daily_sport_activities/activity" in df.columns - - assert set(df["event_type"].unique()) == {"daily_sport_activities"} - - -def test_get_patient_returns_expected_events(tmp_path): - data_root = tmp_path / "daily_sport_activities" - _make_fake_dataset(data_root) - - dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - patient = dataset.get_patient("p1") - events = patient.get_events(event_type="daily_sport_activities") - - assert len(events) == 3 - - event = events[0] - assert "daily_sport_activities/file_path" in event - assert "daily_sport_activities/activity_id" in event - assert "daily_sport_activities/activity" in event - assert "daily_sport_activities/visit_id" in event - - -def test_event_metadata_is_parsed_correctly(tmp_path): - data_root = tmp_path / "daily_sport_activities" - _make_fake_dataset(data_root) - - dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - df = dataset.load_data().compute() - - row = df.iloc[0] - - assert row["patient_id"] in {"p1", "p7", "p8"} - assert row["daily_sport_activities/activity_id"] in {"a01", "a02"} - assert row["daily_sport_activities/activity"] in {"sitting", "standing"} - assert row["daily_sport_activities/visit_id"] in {"s01", "s02"} - assert row["daily_sport_activities/n_rows"] == 125 - assert row["daily_sport_activities/n_cols"] == 45 - - -def test_set_task_generates_samples(tmp_path): - data_root = tmp_path / "daily_sport_activities" - _make_fake_dataset(data_root) - - dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - task = DailyAndSportActivitiesClassification( - window_size=50, - stride=25, - normalize=True, - ) - samples = 
dataset.set_task(task) - - assert len(samples) > 0 - - sample = samples[0] - assert "patient_id" in sample - assert "visit_id" in sample - assert "record_id" in sample - assert "signal" in sample - assert "label" in sample - - assert sample["signal"].shape == (50, 45) - assert isinstance(sample["label"], (int, np.integer)) - - -def test_task_selected_features_reduces_dimension(tmp_path): - data_root = tmp_path / "daily_sport_activities" - _make_fake_dataset(data_root) - - dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - task = DailyAndSportActivitiesClassification( - window_size=50, - stride=25, - selected_features=[0, 1, 2, 3], - ) - samples = dataset.set_task(task) - - assert len(samples) > 0 - assert samples[0]["signal"].shape == (50, 4) - - -def test_task_invalid_feature_index_raises(tmp_path): - data_root = tmp_path / "daily_sport_activities" - _make_fake_dataset(data_root) - - dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - task = DailyAndSportActivitiesClassification( - window_size=50, - stride=25, - selected_features=[999], - ) - - with pytest.raises(ValueError, match="out of bounds"): - dataset.set_task(task) - - -def test_task_window_too_large_raises(tmp_path): - data_root = tmp_path / "daily_sport_activities" - _make_fake_dataset(data_root) - - dataset = DailyAndSportActivitiesDataset(root=str(data_root)) - task = DailyAndSportActivitiesClassification( - window_size=200, - stride=25, - ) - - with pytest.raises(ValueError): - dataset.set_task(task) +import torch + +from pyhealth.datasets import DailyAndSportActivitiesDataset +from pyhealth.tasks import DailyAndSportActivitiesClassification + +class TestDailyAndSportActivityDataset(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.root = ( + Path(__file__).parent.parent.parent + / "test-resources" + / "core" + / "daily-sport-activities" + ) + cls.generate_fake_dataset() + cls.dataset = DailyAndSportActivitiesDataset(cls.root) + cls.samples = 
cls.dataset.set_task(cls.dataset.default_task)
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.samples.close()
+        for activity_dir in cls.root.glob("a*"):
+            if activity_dir.is_dir():
+                shutil.rmtree(activity_dir)
+
+    @classmethod
+    def generate_fake_dataset(cls):
+        """
+        Creates a tiny synthetic dataset with the same folder structure as the real one.
+
+        Structure:
+            root/
+                a01/
+                    p1/
+                        s01.txt
+                        s02.txt
+                    p7/
+                        s01.txt
+                    p8/
+                        s01.txt
+                a02/
+                    p1/
+                        s01.txt
+                    p7/
+                        s01.txt
+                    p8/
+                        s01.txt
+
+        Note:
+            Takes no arguments; files are written under ``cls.root``.
+        """
+        files = [
+            ("a01", "p1", "s01.txt"),
+            ("a01", "p1", "s02.txt"),
+            ("a01", "p7", "s01.txt"),
+            ("a01", "p8", "s01.txt"),
+            ("a02", "p1", "s01.txt"),
+            ("a02", "p7", "s01.txt"),
+            ("a02", "p8", "s01.txt"),
+        ]
+
+        rng = np.random.default_rng(123)
+
+        for activity, subject, segment in files:
+            path = cls.root / activity / subject / segment
+            path.parent.mkdir(parents=True, exist_ok=True)
+            data = rng.normal(size=(125, 45)).astype(np.float32)
+            np.savetxt(path, data, delimiter=",", fmt="%.6f")
+
+    def test_stats(self):
+        self.dataset.stats()
+
+    def test_num_patients(self):
+        self.assertEqual(len(self.dataset.unique_patient_ids), 3)
+
+    def test_load_signal_returns_expected_shape_and_dtype(self):
+        file_path = self.root / "a01" / "p1" / "s01.txt"
+        signal = self.dataset.load_signal(file_path)
+
+        self.assertEqual(signal.shape, (125, 45))
+        self.assertEqual(signal.dtype, np.float32)
+
+    def test_load_signal_invalid_shape_raises(self):
+        bad_path = self.root / "a03" / "p1" / "s01.txt"
+        bad_path.parent.mkdir(parents=True, exist_ok=True)
+
+        bad_signal = np.random.randn(124, 45).astype(np.float32)
+        np.savetxt(bad_path, bad_signal, delimiter=",", fmt="%.6f")
+
+        with self.assertRaises(ValueError):
+            self.dataset.load_signal(bad_path)
+
+        bad_path.unlink()
+        bad_path.parent.rmdir()
+        bad_path.parent.parent.rmdir()
+
+    def test_load_data_returns_event_dataframe(self):
+        df = 
self.dataset.load_data().compute() + + self.assertEqual(len(df), 7) + self.assertIn("patient_id", df.columns) + self.assertIn("event_type", df.columns) + self.assertIn("timestamp", df.columns) + self.assertIn("daily_sport_activities/record_id", df.columns) + self.assertIn("daily_sport_activities/visit_id", df.columns) + self.assertIn("daily_sport_activities/activity_id", df.columns) + self.assertIn("daily_sport_activities/activity", df.columns) + self.assertIn("daily_sport_activities/file_path", df.columns) + self.assertIn("daily_sport_activities/n_rows", df.columns) + self.assertIn("daily_sport_activities/n_cols", df.columns) + + self.assertEqual(set(df["event_type"].unique()), {"daily_sport_activities"}) + + def test_get_patient_p1(self): + events = self.dataset.get_patient("p1").get_events( + event_type="daily_sport_activities" + ) + + self.assertEqual(len(events), 3) + + self.assertEqual(events[0]["visit_id"], "s01") + self.assertEqual(events[1]["visit_id"], "s02") + self.assertIn(events[2]["activity"], {"sitting", "standing"}) + + def test_get_patient_p7(self): + events = self.dataset.get_patient("p7").get_events( + event_type="daily_sport_activities" + ) + + self.assertEqual(len(events), 2) + self.assertEqual(set(event["activity_id"] for event in events), {"a01", "a02"}) + + def test_get_patient_p8(self): + events = self.dataset.get_patient("p8").get_events( + event_type="daily_sport_activities" + ) + + self.assertEqual(len(events), 2) + self.assertEqual(set(event["activity_id"] for event in events), {"a01", "a02"}) + + def test_event_metadata_is_parsed_correctly(self): + df = self.dataset.load_data().compute() + row = df.iloc[0] + + self.assertIn(row["patient_id"], {"p1", "p7", "p8"}) + self.assertIn(row["daily_sport_activities/activity_id"], {"a01", "a02"}) + self.assertIn(row["daily_sport_activities/activity"], {"sitting", "standing"}) + self.assertIn(row["daily_sport_activities/visit_id"], {"s01", "s02"}) + 
self.assertEqual(row["daily_sport_activities/n_rows"], 125) + self.assertEqual(row["daily_sport_activities/n_cols"], 45) + self.assertEqual(row["daily_sport_activities/sampling_rate_hz"], 25) + self.assertEqual(row["daily_sport_activities/duration_seconds"], 5) + + def test_default_task(self): + self.assertIsInstance( + self.dataset.default_task, + DailyAndSportActivitiesClassification, + ) + + def test_task_generates_expected_number_of_samples(self): + self.assertEqual(len(self.samples), 28) + + def test_task_sample_structure(self): + sample = self.samples[0] + + self.assertIn("patient_id", sample) + self.assertIn("visit_id", sample) + self.assertIn("record_id", sample) + self.assertIn("signal", sample) + self.assertIn("label", sample) + + self.assertEqual(sample["signal"].shape, (50, 45)) + self.assertEqual(sample["signal"].dtype, torch.float32) + self.assertIsInstance(sample["label"], torch.Tensor) + self.assertEqual(sample["label"].dtype, torch.int64) + + def test_task_labels_are_correct(self): + labels = [sample["label"] for sample in self.samples] + self.assertEqual(labels.count(0), 16) + self.assertEqual(labels.count(1), 12) + + def test_task_selected_features_reduces_dimension(self): + samples = self.dataset.set_task( + DailyAndSportActivitiesClassification( + signal_loader=self.dataset.load_signal, + window_size=50, + stride=25, + selected_features=[0, 1, 2, 3], + ) + ) + + self.assertGreater(len(samples), 0) + self.assertEqual(samples[0]["signal"].shape, (50, 4)) + samples.close() + + def test_task_invalid_feature_index_raises(self): + with self.assertRaises(ValueError): + self.dataset.set_task( + DailyAndSportActivitiesClassification( + signal_loader=self.dataset.load_signal, + window_size=50, + stride=25, + selected_features=[999], + ) + ) + + def test_task_empty_selected_features_raises(self): + with self.assertRaises(ValueError): + self.dataset.set_task( + DailyAndSportActivitiesClassification( + signal_loader=self.dataset.load_signal, + 
window_size=50, + stride=25, + selected_features=[], + ) + ) + + def test_task_invalid_window_size_raises(self): + with self.assertRaises(ValueError): + DailyAndSportActivitiesClassification( + signal_loader=self.dataset.load_signal, + window_size=0, + stride=25, + ) + + def test_task_invalid_stride_raises(self): + with self.assertRaises(ValueError): + DailyAndSportActivitiesClassification( + signal_loader=self.dataset.load_signal, + window_size=50, + stride=0, + ) + + def test_task_window_larger_than_signal_raises(self): + with self.assertRaises(ValueError): + self.dataset.set_task( + DailyAndSportActivitiesClassification( + signal_loader=self.dataset.load_signal, + window_size=200, + stride=25, + ) + ) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file