1 change: 1 addition & 0 deletions docs/api/datasets.rst
@@ -244,5 +244,6 @@ Available Datasets
datasets/pyhealth.datasets.ClinVarDataset
datasets/pyhealth.datasets.COSMICDataset
datasets/pyhealth.datasets.TCGAPRADDataset
datasets/pyhealth.datasets.DailyAndSportActivitiesDataset
datasets/pyhealth.datasets.splitter
datasets/pyhealth.datasets.utils
@@ -0,0 +1,9 @@
pyhealth.datasets.DailyAndSportActivitiesDataset
================================================

The Daily and Sports Activities dataset. For more information see `here <https://archive.ics.uci.edu/dataset/256/daily+and+sports+activities>`_.

.. autoclass:: pyhealth.datasets.DailyAndSportActivitiesDataset
:members:
:undoc-members:
:show-inheritance:
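As a quick orientation to the on-disk format: each segment file in the dataset is a plain comma-separated text matrix (125 rows by 45 columns in the real data). The sketch below parses such a file with the standard library only; it is an illustration of the file layout, not PyHealth's actual loader, and the tiny 2x3 `fake_file` is invented for the demo.

```python
import csv
import io

# A tiny stand-in for one sZZ.txt segment file: comma-separated rows
# of sensor readings (the real files are 125 rows x 45 columns).
fake_file = io.StringIO("0.1,0.2,0.3\n0.4,0.5,0.6\n")

# Each row becomes a list of floats, one value per sensor axis.
rows = [[float(v) for v in row] for row in csv.reader(fake_file)]
print(len(rows), len(rows[0]))  # 2 3
```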
1 change: 1 addition & 0 deletions docs/api/tasks.rst
@@ -230,3 +230,4 @@ Available Tasks
Mutation Pathogenicity (COSMIC) <tasks/pyhealth.tasks.MutationPathogenicityPrediction>
Cancer Survival Prediction (TCGA) <tasks/pyhealth.tasks.CancerSurvivalPrediction>
Cancer Mutation Burden (TCGA) <tasks/pyhealth.tasks.CancerMutationBurden>
Daily and Sports Activities <tasks/pyhealth.tasks.DailyAndSportActivities>
7 changes: 7 additions & 0 deletions docs/api/tasks/pyhealth.tasks.DailyAndSportActivities.rst
@@ -0,0 +1,7 @@
pyhealth.tasks.DailyAndSportActivitiesClassification
====================================================

.. autoclass:: pyhealth.tasks.DailyAndSportActivitiesClassification
:members:
:undoc-members:
:show-inheritance:
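The classification task takes `window_size` and `stride` settings (the quantities varied in the example notebook's ablation). The pure-Python sketch below shows how such settings partition each 125-row segment into overlapping windows; it is illustrative only and not the task's actual implementation, and `sliding_windows` is a hypothetical helper name.

```python
def sliding_windows(signal, window_size, stride):
    """Split a list of rows into overlapping fixed-size windows (illustrative)."""
    windows = []
    start = 0
    # Keep only full windows; a trailing partial window is dropped.
    while start + window_size <= len(signal):
        windows.append(signal[start:start + window_size])
        start += stride
    return windows

# A 125-row segment (5 seconds at 25 Hz), as in the dataset files.
segment = [[0.0] * 45 for _ in range(125)]
print(len(sliding_windows(segment, 25, 10)))  # 11
print(len(sliding_windows(segment, 50, 25)))  # 4
```

Note the sample-count trade-off the ablation discusses: halving the window size (and shrinking the stride) yields almost three times as many training windows per segment.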
358 changes: 358 additions & 0 deletions examples/daily_sport_activities.ipynb
@@ -0,0 +1,358 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3d4f2a5e",
"metadata": {},
"source": [
"# Daily and Sports Activities RNN Ablation Study\n",
"\n",
"This notebook demonstrates an ablation study for the Daily and Sports Activities dataset using a built-in PyHealth RNN model.\n",
"\n",
"## Goal\n",
"Evaluate how task configuration affects downstream multiclass activity recognition performance.\n",
"\n",
"## Ablation configurations\n",
"1. `window_size=25, stride=10, normalize=True`\n",
"2. `window_size=50, stride=25, normalize=True`\n",
"3. `window_size=50, stride=25, normalize=False`\n",
"\n",
"## Workflow\n",
"- Build a small synthetic dataset in the real folder format: `aXX/pY/sZZ.txt`\n",
"- Create PyHealth task datasets under multiple task settings\n",
"- Train/evaluate a built-in PyHealth RNN\n",
"- Compare test accuracy across configurations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ceac2c74",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import tempfile\n",
"import random\n",
"\n",
"import numpy as np\n",
"\n",
"from pyhealth.datasets import DailyAndSportActivitiesDataset\n",
"from pyhealth.tasks import DailyAndSportActivitiesClassification\n",
"from pyhealth.datasets import split_by_patient, get_dataloader\n",
"from pyhealth.models import RNN\n",
"from pyhealth.trainer import Trainer\n",
"\n",
"random.seed(123)\n",
"np.random.seed(123)"
]
},
{
"cell_type": "markdown",
"id": "1c74bc91",
"metadata": {},
"source": [
"## Synthetic data generation\n",
"\n",
"We create synthetic segment files in the same folder structure as the real dataset:\n",
"\n",
"- `a01`, `a02`, `a03` for activities\n",
"- `p1`, `p2`, ... for subjects\n",
"- `s01.txt`, `s02.txt`, ... for segments\n",
"\n",
"Each file is a `125 x 45` matrix:\n",
"- 125 rows = 5 seconds × 25 Hz\n",
"- 45 columns = 5 body units × 9 sensor axes\n",
"\n",
"We inject class-specific structure so the model has a signal to learn."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ceae0a10",
"metadata": {},
"outputs": [],
"source": [
"def _write_fake_signal_file(\n",
" file_path: Path,\n",
" activity_idx: int,\n",
" shape=(125, 45),\n",
" seed: int = 0,\n",
") -> None:\n",
" rng = np.random.default_rng(seed)\n",
" signal = rng.normal(loc=0.0, scale=1.0, size=shape).astype(np.float32)\n",
"\n",
" # Inject class-dependent structure\n",
" start_col = activity_idx * 3\n",
" end_col = min(start_col + 3, shape[1])\n",
" signal[:, start_col:end_col] += 2.5\n",
"\n",
" # Add a mild temporal trend by class\n",
" t = np.linspace(0, 1, shape[0], dtype=np.float32)[:, None]\n",
" signal[:, start_col:end_col] += (activity_idx + 1) * 0.5 * t\n",
"\n",
" file_path.parent.mkdir(parents=True, exist_ok=True)\n",
" np.savetxt(file_path, signal, delimiter=\",\", fmt=\"%.6f\")\n",
"\n",
"\n",
"def build_synthetic_dataset(root: Path) -> None:\n",
" activities = [\"a01\", \"a02\", \"a03\"]\n",
" subjects = [f\"p{i}\" for i in range(1, 7)]\n",
" segments_per_subject = 4\n",
"\n",
" seed = 0\n",
" for activity_idx, activity in enumerate(activities):\n",
" for subject in subjects:\n",
" for seg in range(1, segments_per_subject + 1):\n",
" file_path = root / activity / subject / f\"s{seg:02d}.txt\"\n",
" _write_fake_signal_file(\n",
" file_path=file_path,\n",
" activity_idx=activity_idx,\n",
" seed=seed,\n",
" )\n",
" seed += 1"
]
},
{
"cell_type": "markdown",
"id": "6df68140",
"metadata": {},
"source": [
"## Build the synthetic dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3b023a0",
"metadata": {},
"outputs": [],
"source": [
"tmpdir = tempfile.TemporaryDirectory()\n",
"root = Path(tmpdir.name) / \"daily_sport_activities\"\n",
"build_synthetic_dataset(root)\n",
"\n",
"root"
]
},
{
"cell_type": "markdown",
"id": "08e695fe",
"metadata": {},
"source": [
"## Define one ablation run\n",
"\n",
"For each configuration:\n",
"- build a task-specific sample dataset\n",
"- split by patient\n",
"- create dataloaders\n",
"- train a built-in PyHealth RNN\n",
"- evaluate on the test set"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "227158ea",
"metadata": {},
"outputs": [],
"source": [
"def run_one_config(root: Path, cfg: dict) -> dict:\n",
" dataset = DailyAndSportActivitiesDataset(root=str(root))\n",
" task = DailyAndSportActivitiesClassification(signal_loader=dataset.load_signal, **cfg)\n",
"\n",
" sample_dataset = dataset.set_task(task)\n",
"\n",
" train_ds, val_ds, test_ds = split_by_patient(sample_dataset, [0.6, 0.2, 0.2])\n",
"\n",
" train_loader = get_dataloader(train_ds, batch_size=16, shuffle=True)\n",
" val_loader = get_dataloader(val_ds, batch_size=16, shuffle=False)\n",
" test_loader = get_dataloader(test_ds, batch_size=16, shuffle=False)\n",
"\n",
" model = RNN(\n",
" dataset=sample_dataset,\n",
" embedding_dim=128,\n",
" hidden_dim=64,\n",
" rnn_type=\"GRU\",\n",
" num_layers=1,\n",
" dropout=0.1,\n",
" )\n",
"\n",
" trainer = Trainer(\n",
" model=model,\n",
" metrics=[\"accuracy\"],\n",
" device=\"cpu\",\n",
" enable_logging=False,\n",
" )\n",
"\n",
" trainer.train(\n",
" train_dataloader=train_loader,\n",
" val_dataloader=val_loader,\n",
" epochs=5,\n",
" monitor=\"accuracy\",\n",
" monitor_criterion=\"max\",\n",
" )\n",
"\n",
" scores = trainer.evaluate(test_loader)\n",
"\n",
" return {\n",
" \"config\": cfg,\n",
" \"n_total_samples\": len(sample_dataset),\n",
" \"n_train\": len(train_ds),\n",
" \"n_val\": len(val_ds),\n",
" \"n_test\": len(test_ds),\n",
" \"accuracy\": float(scores[\"accuracy\"]),\n",
" }"
]
},
{
"cell_type": "markdown",
"id": "aa33d665",
"metadata": {},
"source": [
"## Define ablation settings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fca5285",
"metadata": {},
"outputs": [],
"source": [
"configs = [\n",
" {\"window_size\": 25, \"stride\": 10, \"normalize\": True},\n",
" {\"window_size\": 50, \"stride\": 25, \"normalize\": True},\n",
" {\"window_size\": 50, \"stride\": 25, \"normalize\": False},\n",
"]\n",
"\n",
"configs"
]
},
{
"cell_type": "markdown",
"id": "8a3d3843",
"metadata": {},
"source": [
"## Run the ablation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c5784e2",
"metadata": {},
"outputs": [],
"source": [
"results = []\n",
"\n",
"for cfg in configs:\n",
" print(f\"Running config: {cfg}\")\n",
" result = run_one_config(root, cfg)\n",
" results.append(result)\n",
" print(result)\n",
" print(\"-\" * 80)"
]
},
{
"cell_type": "markdown",
"id": "d36015e3",
"metadata": {},
"source": [
"## Summarize results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df67cb3d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"results_df = pd.DataFrame(results)\n",
"results_df"
]
},
{
"cell_type": "markdown",
"id": "3d194c27",
"metadata": {},
"source": [
"## Sort by accuracy"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89fe3ef7",
"metadata": {},
"outputs": [],
"source": [
"results_df.sort_values(\"accuracy\", ascending=False).reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"id": "eb351598",
"metadata": {},
"source": [
"## Brief interpretation\n",
"\n",
"- Larger windows (50) outperform smaller windows (25), likely due to better temporal context.\n",
"- Normalization **decreased** performance in this setup, suggesting the model may already handle raw scale well or synthetic signal structure was distorted.\n",
"- Smaller windows increase sample count but may reduce per-sample information, hurting performance.\n",
"- Note: Results are based on synthetic data; trends illustrate pipeline behavior rather than real-world performance."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b5a3c92",
"metadata": {},
"outputs": [],
"source": [
"best_row = results_df.sort_values(\"accuracy\", ascending=False).iloc[0]\n",
"\n",
"print(\"Best configuration:\")\n",
"print(best_row[\"config\"])\n",
"print(f\"Accuracy: {best_row['accuracy']:.4f}\")\n",
"print(f\"Total task samples: {best_row['n_total_samples']}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a4b2cfc",
"metadata": {},
"outputs": [],
"source": [
"tmpdir.cleanup()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pyhealth-project",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
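The notebook's third ablation setting toggles a `normalize` flag. One common interpretation is a per-channel z-score over each window, sketched below in plain Python; the task's actual normalization may differ, and `zscore_columns` is a hypothetical helper written for illustration.

```python
def zscore_columns(matrix):
    """Z-score each column of a list-of-rows matrix (illustrative only)."""
    n = len(matrix)
    n_cols = len(matrix[0])
    means = [sum(row[c] for row in matrix) / n for c in range(n_cols)]
    stds = []
    for c in range(n_cols):
        var = sum((row[c] - means[c]) ** 2 for row in matrix) / n
        stds.append(var ** 0.5 or 1.0)  # guard against zero-variance channels
    return [
        [(row[c] - means[c]) / stds[c] for c in range(n_cols)]
        for row in matrix
    ]

# Two rows, two channels: the first channel varies, the second is constant.
data = [[1.0, 10.0], [3.0, 10.0]]
normalized = zscore_columns(data)
print(normalized)  # [[-1.0, 0.0], [1.0, 0.0]]
```

Normalization like this removes per-channel scale, which can help or hurt: as the notebook's interpretation notes, if class identity is partly encoded in raw signal magnitude (as in the synthetic data here), standardizing can discard useful information.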
1 change: 1 addition & 0 deletions pyhealth/datasets/__init__.py
@@ -68,6 +68,7 @@ def __init__(self, *args, **kwargs):
from .bmd_hs import BMDHSDataset
from .support2 import Support2Dataset
from .tcga_prad import TCGAPRADDataset
from .daily_sport_activities import DailyAndSportActivitiesDataset
from .splitter import (
sample_balanced,
split_by_patient,