From c91783707d611c6c86bc6f4ae6a00a011b0658ec Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:09:21 +0000 Subject: [PATCH 1/2] Initial plan From fa6bee382e449437487cbe03c5d2114d11453a88 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:24:40 +0000 Subject: [PATCH 2/2] Implement all empty modules, add tests, fix config files Co-authored-by: ansuff <63700848+ansuff@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- README.md | 93 ++++++++++-- init_setup.sh | 16 --- pyproject.toml | 8 +- settings.toml | 5 + src/awai/__init__.py | 1 + src/awai/entrypoint.py | 58 ++++++++ src/awai/models/__init__.py | 5 + src/awai/models/schemas.py | 22 +++ src/awai/tasks/__init__.py | 6 + src/awai/tasks/load_data.py | 62 ++++++++ src/awai/tasks/prepare_data.py | 62 ++++++++ src/awai/utils/__init__.py | 27 ++++ src/awai/utils/data_cleaner.py | 251 +++++++++++++++++++++++++++++++++ src/awai/utils/xml_parser.py | 87 ++++++++++++ tests/__init__.py | 0 tests/test_data_cleaner.py | 208 +++++++++++++++++++++++++++ tests/test_xml_parser.py | 95 +++++++++++++ 18 files changed, 975 insertions(+), 33 deletions(-) create mode 100644 src/awai/models/schemas.py create mode 100644 src/awai/tasks/load_data.py create mode 100644 src/awai/tasks/prepare_data.py create mode 100644 src/awai/utils/data_cleaner.py create mode 100644 src/awai/utils/xml_parser.py create mode 100644 tests/__init__.py create mode 100644 tests/test_data_cleaner.py create mode 100644 tests/test_xml_parser.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 96a6bec..22410fb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ default_language_version: python: python3 -files: '^src/.*\.pyi$|^tests/.\.pyi?$' +files: '^src/.*\.pyi?$|^tests/.*\.pyi?$' fail_fast: true repos: - repo: 
https://github.com/pre-commit/pre-commit-hooks diff --git a/README.md b/README.md index e494c32..89abc80 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,96 @@ # Apple Watch Fitness AI -A small project to create a fitness AI for the Apple Watch. -The goal is to see if I can predict which time of the day is best to do a workout based on the user's heart rate, sleep, and activity data. Later on, I will try to predict the type of workout that the user should do based on the same data. +A data-science project that predicts optimal workout timing from Apple Watch health data. -Also, I will try to predict the best day of the week to do a workout based on the same data. +## Goals -More to come... +- **Phase 1** – Predict the best *time of day* for a workout (heart rate, sleep, activity data). +- **Phase 2** – Predict the best *type* of workout. +- **Phase 3** – Predict the best *day of the week* for a workout. -## Setup Instructions +## Project structure -To get started with this project, you can use the `init_setup.sh` script to install all necessary dependencies. This script will: +``` +AppleWatchAI/ +├── src/awai/ +│ ├── entrypoint.py # CLI entry point (fire-based) +│ ├── utils/ +│ │ ├── xml_parser.py # Parse Apple Watch XML exports +│ │ └── data_cleaner.py # Cleaning, filtering & aggregation helpers +│ ├── models/ +│ │ └── schemas.py # Pandera validation schemas +│ └── tasks/ +│ ├── load_data.py # XML → DuckDB load task +│ └── prepare_data.py # Full clean/filter/aggregate pipeline +├── notebooks/exploratory/ # Jupytext-managed EDA notebooks +├── tests/ # pytest test suite +├── data/ # Apple Watch export files (git-ignored) +└── settings.toml # Default configuration (Dynaconf) +``` + +## Setup + +### Prerequisites -- Check if Homebrew is installed and install it if necessary. -- Install Poetry using Homebrew. -- Install npm using Homebrew. -- Install nodemon globally using npm. 
+- Python 3.11 or later +- [Poetry](https://python-poetry.org/) (install via `pip install poetry` or `brew install poetry`) -To run the setup script, use the following command in your terminal: +### macOS quick-start ```sh -./init_setup.sh +./init_setup.sh # installs Homebrew & Poetry if missing +poetry install # installs all Python dependencies ``` -After running the setup script, you need to install the project dependencies using Poetry. Run the following command in your terminal: +### Other platforms ```sh +pip install poetry poetry install ``` -This will install all the dependencies specified in the pyproject.toml file. +## Usage + +Export your Apple Health data from the iPhone Health app (*Profile → Export All Health Data*) and place the resulting `export.xml` inside a `data/` folder at the project root. + +### Load the XML export into DuckDB + +```sh +poetry run awai load --xml_path=data/export.xml --db_path=data/health_data.duckdb +``` + +### Run the data-preparation pipeline + +```sh +poetry run awai prepare --db_path=data/health_data.duckdb +``` + +### Explore interactively + +The `notebooks/exploratory/EDA.py` notebook contains the original exploratory analysis. +To convert it to a Jupyter notebook and open it: + +```sh +poetry run jupytext --sync notebooks/exploratory/EDA.py +poetry run jupyter lab notebooks/exploratory/EDA.ipynb +``` + +## Running tests + +```sh +poetry run pytest +``` + +## Configuration + +Default settings live in `settings.toml`. Override any value with an environment variable prefixed with `DYNACONF_`, e.g.: + +```sh +export DYNACONF_DATA_DIR=/path/to/my/data +``` + +## Contributing -You are now ready to start working on the project! 🚀 \ No newline at end of file +1. Install dev dependencies: `poetry install` +2. Install pre-commit hooks: `poetry run pre-commit install` +3. 
Run the test suite: `poetry run pytest` diff --git a/init_setup.sh b/init_setup.sh index f968e2a..ee3c78b 100644 --- a/init_setup.sh +++ b/init_setup.sh @@ -21,20 +21,4 @@ else echo "Poetry is already installed." fi -# Install npm using Homebrew -if ! command_exists npm; then - echo "Installing npm..." - brew install npm -else - echo "npm is already installed." -fi - -# Install nodemon globally using npm -if ! command_exists nodemon; then - echo "Installing nodemon globally..." - npm install -g nodemon -else - echo "nodemon is already installed." -fi - echo "Setup complete." \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c7e0a7c..2ddac4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,12 @@ formats = "ipynb,py:percent" [tool.ruff.lint] extend-select = ["I"] +[tool.poetry.scripts] +awai = "awai.entrypoint:main" + +[tool.pytest.ini_options] +testpaths = ["tests"] + [build-system] -requires = ["poetry-core==1.8.2"] +requires = ["poetry-core>=1.9.0"] build-backend = "poetry.core.masonry.api" diff --git a/settings.toml b/settings.toml index e69de29..532124f 100644 --- a/settings.toml +++ b/settings.toml @@ -0,0 +1,5 @@ +[default] +data_dir = "data" +xml_file_name = "export.xml" +db_name = "health_data.duckdb" +log_level = "INFO" diff --git a/src/awai/__init__.py b/src/awai/__init__.py index e69de29..a430591 100644 --- a/src/awai/__init__.py +++ b/src/awai/__init__.py @@ -0,0 +1 @@ +"""Apple Watch AI – workout timing prediction from Apple Health data.""" diff --git a/src/awai/entrypoint.py b/src/awai/entrypoint.py index e69de29..29b4362 100644 --- a/src/awai/entrypoint.py +++ b/src/awai/entrypoint.py @@ -0,0 +1,58 @@ +"""CLI entry point for Apple Watch AI.""" + +from pathlib import Path + +import fire +from loguru import logger + +from awai.tasks.load_data import load_to_duckdb +from awai.tasks.prepare_data import prepare_records + + +class CLI: + """Apple Watch AI command-line interface. 
+ + Available commands:: + + awai load – parse export.xml and store data in DuckDB + awai prepare – clean, filter, and aggregate the stored data + """ + + def load( + self, + xml_path: str = "data/export.xml", + db_path: str = "data/health_data.duckdb", + ) -> None: + """Parse an Apple Watch XML export and load it into a DuckDB database. + + Args: + xml_path: Path to the Apple Watch ``export.xml`` file. + db_path: Path where the DuckDB database will be created. + """ + load_to_duckdb(Path(xml_path), Path(db_path)) + logger.info("Load complete.") + + def prepare( + self, + db_path: str = "data/health_data.duckdb", + ) -> None: + """Run the data-preparation pipeline (clean, filter, aggregate). + + Args: + db_path: Path to the DuckDB database created by the ``load`` command. + """ + by_type, daily, monthly = prepare_records(Path(db_path)) + logger.info( + f"Prepared {len(by_type)} record types, " + f"{sum(len(v) for v in daily.values()):,} daily rows, " + f"{sum(len(v) for v in monthly.values()):,} monthly rows." + ) + + +def main() -> None: + """Fire-based CLI entry point.""" + fire.Fire(CLI) + + +if __name__ == "__main__": + main() diff --git a/src/awai/models/__init__.py b/src/awai/models/__init__.py index e69de29..39b5cf2 100644 --- a/src/awai/models/__init__.py +++ b/src/awai/models/__init__.py @@ -0,0 +1,5 @@ +"""Data models and validation schemas.""" + +from awai.models.schemas import RecordsSchema + +__all__ = ["RecordsSchema"] diff --git a/src/awai/models/schemas.py b/src/awai/models/schemas.py new file mode 100644 index 0000000..8d7c6a4 --- /dev/null +++ b/src/awai/models/schemas.py @@ -0,0 +1,22 @@ +"""Pandera schemas for validating Apple Watch health data DataFrames.""" + +import pandera as pa +from pandera import Column, DataFrameSchema + +#: Schema for the cleaned health records DataFrame produced by +#: :func:`~awai.utils.data_cleaner.clean_records`. 
+RecordsSchema = DataFrameSchema( + { + "type": Column(str, nullable=False), + "Date": Column( + str, + pa.Check.str_matches(r"^\d{4}-\d{2}-\d{2}$"), + nullable=False, + ), + "Day": Column(str, nullable=False), + "Month": Column(str, nullable=False), + "value": Column(float, pa.Check.ge(0), nullable=False), + "unit": Column(str, nullable=True), + }, + coerce=True, +) diff --git a/src/awai/tasks/__init__.py b/src/awai/tasks/__init__.py index e69de29..a02d07b 100644 --- a/src/awai/tasks/__init__.py +++ b/src/awai/tasks/__init__.py @@ -0,0 +1,6 @@ +"""Task modules for data loading and preparation.""" + +from awai.tasks.load_data import load_to_duckdb +from awai.tasks.prepare_data import prepare_records + +__all__ = ["load_to_duckdb", "prepare_records"] diff --git a/src/awai/tasks/load_data.py b/src/awai/tasks/load_data.py new file mode 100644 index 0000000..b4193ae --- /dev/null +++ b/src/awai/tasks/load_data.py @@ -0,0 +1,62 @@ +"""Task: load an Apple Watch XML export into a DuckDB database.""" + +from pathlib import Path + +import duckdb +from loguru import logger + +from awai.utils.xml_parser import ( + extract_activity_summaries, + extract_records, + extract_workouts, + load_xml_export, +) + + +def load_to_duckdb(xml_path: Path, db_path: Path) -> None: + """Parse the Apple Watch XML export and persist data into DuckDB. + + Three tables are created (if they do not already exist): + + * ``records`` – time-series health measurements. + * ``workouts`` – individual workout sessions (flattened). + * ``activities`` – daily activity-ring summaries. + + If all three tables are already present in *db_path* the function exits + early without re-parsing the XML file. + + Args: + xml_path: Path to the Apple Watch ``export.xml`` file. + db_path: Path where the DuckDB database will be created or opened. 
+ """ + xml_path = Path(xml_path) + db_path = Path(db_path) + + con = duckdb.connect(str(db_path)) + try: + already_loaded = ( + con.execute( + "SELECT COUNT(*) FROM information_schema.tables " + "WHERE table_schema = 'main' " + "AND table_name IN ('records', 'workouts', 'activities')" + ).fetchone()[0] + >= 3 + ) + if already_loaded: + logger.info("All tables already exist in DuckDB – skipping load.") + return + + logger.info(f"Parsing XML export from {xml_path} …") + health_data = load_xml_export(xml_path) + + records_df = extract_records(health_data) + workout_df_flat = extract_workouts(health_data) + activity_df = extract_activity_summaries(health_data) + + con.execute("CREATE TABLE records AS SELECT * FROM records_df") + con.execute("CREATE TABLE workouts AS SELECT * FROM workout_df_flat") + con.execute("CREATE TABLE activities AS SELECT * FROM activity_df") + + logger.info("Data successfully loaded into DuckDB.") + finally: + con.close() diff --git a/src/awai/tasks/prepare_data.py b/src/awai/tasks/prepare_data.py new file mode 100644 index 0000000..6263fb7 --- /dev/null +++ b/src/awai/tasks/prepare_data.py @@ -0,0 +1,62 @@ +"""Task: prepare cleaned health data for downstream analysis and ML.""" + +from pathlib import Path + +import duckdb +import pandas as pd +from loguru import logger + +from awai.utils.data_cleaner import ( + aggregate_daily, + aggregate_monthly, + clean_records, + filter_record_types, +) + + +def load_records_from_db(db_path: Path) -> pd.DataFrame: + """Read the raw ``records`` table from a DuckDB database. + + Args: + db_path: Path to the DuckDB database (created by :mod:`~awai.tasks.load_data`). + + Returns: + Raw records :class:`~pandas.DataFrame` as stored in the database. 
+ """ + db_path = Path(db_path) + con = duckdb.connect(str(db_path), read_only=True) + try: + df = con.query("SELECT * FROM records").to_df() + finally: + con.close() + logger.info(f"Loaded {len(df):,} records from {db_path}") + return df + + +def prepare_records( + db_path: Path, +) -> tuple[dict[str, pd.DataFrame], dict[str, pd.DataFrame], dict[str, pd.DataFrame]]: + """Run the full data-preparation pipeline for health records. + + Pipeline steps: + + 1. Load raw records from DuckDB. + 2. Clean and normalise the DataFrame (:func:`~awai.utils.data_cleaner.clean_records`). + 3. Split into per-type DataFrames (:func:`~awai.utils.data_cleaner.filter_record_types`). + 4. Compute daily aggregations (:func:`~awai.utils.data_cleaner.aggregate_daily`). + 5. Compute monthly aggregations (:func:`~awai.utils.data_cleaner.aggregate_monthly`). + + Args: + db_path: Path to the DuckDB database. + + Returns: + A 3-tuple ``(records_by_type, daily, monthly)`` where each element is + a ``dict`` mapping record-type names to :class:`~pandas.DataFrame` objects. 
+ """ + raw_df = load_records_from_db(db_path) + cleaned = clean_records(raw_df) + by_type = filter_record_types(cleaned) + daily = aggregate_daily(by_type) + monthly = aggregate_monthly(by_type) + logger.info("Data preparation complete.") + return by_type, daily, monthly diff --git a/src/awai/utils/__init__.py b/src/awai/utils/__init__.py index e69de29..e009f3a 100644 --- a/src/awai/utils/__init__.py +++ b/src/awai/utils/__init__.py @@ -0,0 +1,27 @@ +"""Utility modules for Apple Watch AI.""" + +from awai.utils.data_cleaner import ( + aggregate_daily, + aggregate_monthly, + camel_to_snake, + clean_records, + filter_record_types, +) +from awai.utils.xml_parser import ( + extract_activity_summaries, + extract_records, + extract_workouts, + load_xml_export, +) + +__all__ = [ + "camel_to_snake", + "clean_records", + "filter_record_types", + "aggregate_daily", + "aggregate_monthly", + "load_xml_export", + "extract_records", + "extract_workouts", + "extract_activity_summaries", +] diff --git a/src/awai/utils/data_cleaner.py b/src/awai/utils/data_cleaner.py new file mode 100644 index 0000000..2913dc8 --- /dev/null +++ b/src/awai/utils/data_cleaner.py @@ -0,0 +1,251 @@ +"""Data cleaning and aggregation utilities for Apple Watch health data.""" + +import re + +import pandas as pd +from loguru import logger + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +#: Health record types that are relevant to workout-timing analysis. 
RECORD_TYPES: list[str] = [
+    "BodyMass",
+    "ActiveEnergyBurned",
+    "BasalEnergyBurned",
+    "DistanceWalkingRunning",
+    "StepCount",
+    "AppleStandTime",
+    "WalkingSpeed",
+    "DistanceCycling",
+    "HeartRateVariabilitySDNN",
+    "RestingHeartRate",
+    "WalkingHeartRateAverage",
+    "VO2Max",
+    "HeartRateRecoveryOneMinute",
+    "PhysicalEffort",
+    "SleepAnalysis",
+]
+
+#: Columns dropped during cleaning (metadata that is not useful for analysis).
+COLUMNS_TO_DROP: list[str] = [
+    "source_name",
+    "source_version",
+    "device",
+    "creation_date",
+    "end_date",
+    "metadata_entry",
+    "heart_rate_variability_metadata_list",
+]
+
+#: Record types whose daily values should be *summed* (rather than averaged).
+DAILY_SUM_KEYS: list[str] = [
+    "BasalEnergyBurned",
+    "ActiveEnergyBurned",
+    "DistanceWalkingRunning",
+    "StepCount",
+    "AppleStandTime",
+    "DistanceCycling",
+    "PhysicalEffort",
+]
+
+# ---------------------------------------------------------------------------
+# Low-level helpers
+# ---------------------------------------------------------------------------
+
+
+def camel_to_snake(name: str) -> str:
+    """Convert a camelCase or ``@camelCase`` column name to ``snake_case``.
+
+    Args:
+        name: The original column name (may start with ``@``).
+
+    Returns:
+        The snake_case equivalent.
+
+    Examples:
+        >>> camel_to_snake("startDate")
+        'start_date'
+        >>> camel_to_snake("@type")
+        'type'
+    """
+    name = re.sub("@", "", name)
+    name = re.sub(r"(?<!^)(?=[A-Z])", "_", name)
+    return name.lower()
+
+
+def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Rename all DataFrame columns from camelCase / ``@camelCase`` to snake_case.
+
+    Args:
+        df: Input DataFrame.
+
+    Returns:
+        A copy of *df* with renamed columns.
+ """ + df = df.copy() + df.columns = [camel_to_snake(col) for col in df.columns] + return df + + +# --------------------------------------------------------------------------- +# Records cleaning +# --------------------------------------------------------------------------- + + +def clean_records(df: pd.DataFrame) -> pd.DataFrame: + """Clean and transform the raw health records DataFrame. + + Steps performed: + + 1. Rename columns to ``snake_case``. + 2. Parse ``start_date`` / ``end_date`` and compute ``duration``. + 3. Drop metadata columns (see :data:`COLUMNS_TO_DROP`). + 4. Add ``Day``, ``Date``, and ``Month`` helper columns from ``start_date``. + 5. Coerce ``value`` to :class:`float` (fill non-numeric rows with ``1.0``). + 6. Strip ``HKQuantityTypeIdentifier`` / ``HKCategoryTypeIdentifier`` prefixes + from the ``type`` column. + + Args: + df: Raw records DataFrame as produced by + :func:`~awai.utils.xml_parser.extract_records`. + + Returns: + Cleaned DataFrame ready for further analysis. + """ + df = rename_columns(df) + + # Parse dates and compute duration before any columns are dropped. + for col in ("start_date", "end_date"): + if col in df.columns: + df[col] = pd.to_datetime(df[col]) + if "start_date" in df.columns and "end_date" in df.columns: + df["duration"] = df["end_date"] - df["start_date"] + + # Drop metadata columns that are present in this export. + existing_drops = [c for c in COLUMNS_TO_DROP if c in df.columns] + df = df.drop(columns=existing_drops) + + # Add calendar helper columns derived from start_date. + if "start_date" in df.columns: + df["Day"] = df["start_date"].dt.strftime("%A") + df["Date"] = df["start_date"].dt.strftime("%Y-%m-%d") + df["Month"] = df["start_date"].dt.strftime("%B") + + # Coerce value to float; records that have no numeric value (e.g. presence + # indicators such as SleepAnalysis) are assigned 1.0 so they count as one + # occurrence and can still be summed/aggregated meaningfully. 
+ if "value" in df.columns: + df["value"] = pd.to_numeric(df["value"], errors="coerce").fillna(1.0).astype(float) + + # Shorten Apple's verbose type identifiers. + if "type" in df.columns: + df["type"] = ( + df["type"] + .str.replace("HKQuantityTypeIdentifier", "", regex=False) + .str.replace("HKCategoryTypeIdentifier", "", regex=False) + ) + + # Reorder to a canonical column order; any extra columns go at the end. + desired = ["type", "Date", "Day", "Month", "value", "unit", "duration"] + available = [c for c in desired if c in df.columns] + remaining = [c for c in df.columns if c not in desired] + df = df[available + remaining] + + logger.info(f"Cleaned records: {len(df):,} rows, {len(df.columns)} columns") + return df + + +# --------------------------------------------------------------------------- +# Filtering & aggregation +# --------------------------------------------------------------------------- + + +def filter_record_types( + df: pd.DataFrame, + record_types: list[str] | None = None, +) -> dict[str, pd.DataFrame]: + """Split a cleaned records DataFrame into one sub-DataFrame per health type. + + Args: + df: Cleaned records DataFrame (output of :func:`clean_records`). + record_types: Types to extract. Defaults to :data:`RECORD_TYPES`. + + Returns: + A ``dict`` mapping each record type name to a filtered and renamed + DataFrame where the ``value`` column is renamed to the type name. + """ + if record_types is None: + record_types = RECORD_TYPES + + result: dict[str, pd.DataFrame] = {} + for rt in record_types: + mask = df["type"].str.contains(rt, regex=False) + subset = df.loc[mask].rename(columns={"value": rt}).sort_values("Date") + result[rt] = subset + + logger.info(f"Filtered into {len(result)} record type groups") + return result + + +def aggregate_daily( + records_by_type: dict[str, pd.DataFrame], + keys: list[str] | None = None, +) -> dict[str, pd.DataFrame]: + """Aggregate per-type DataFrames to *daily* totals (sum). 
+ + Args: + records_by_type: Output of :func:`filter_record_types`. + keys: Types to aggregate. Defaults to :data:`DAILY_SUM_KEYS`. + + Returns: + A ``dict`` mapping each type name to a daily-aggregated DataFrame. + """ + if keys is None: + keys = DAILY_SUM_KEYS + + daily: dict[str, pd.DataFrame] = {} + for key in keys: + if key not in records_by_type: + logger.warning(f"Key '{key}' not found in records – skipping daily aggregation") + continue + df = records_by_type[key] + daily[key] = ( + df.groupby("Date") + .agg({key: "sum", "Day": lambda x: x.mode().iloc[0] if not x.mode().empty else x.iloc[0]}) + .reset_index() + ) + + return daily + + +def aggregate_monthly( + records_by_type: dict[str, pd.DataFrame], + keys: list[str] | None = None, +) -> dict[str, pd.DataFrame]: + """Aggregate per-type DataFrames to *monthly* totals (sum). + + Args: + records_by_type: Output of :func:`filter_record_types`. + keys: Types to aggregate. Defaults to :data:`DAILY_SUM_KEYS`. + + Returns: + A ``dict`` mapping each type name to a monthly-aggregated DataFrame. + """ + if keys is None: + keys = DAILY_SUM_KEYS + + monthly: dict[str, pd.DataFrame] = {} + for key in keys: + if key not in records_by_type: + logger.warning(f"Key '{key}' not found in records – skipping monthly aggregation") + continue + df = records_by_type[key] + monthly[key] = ( + df.groupby(df["Date"].str[:-3]) + .agg({key: "sum", "Month": lambda x: x.mode().iloc[0] if not x.mode().empty else x.iloc[0]}) + .reset_index() + ) + + return monthly diff --git a/src/awai/utils/xml_parser.py b/src/awai/utils/xml_parser.py new file mode 100644 index 0000000..c35af8d --- /dev/null +++ b/src/awai/utils/xml_parser.py @@ -0,0 +1,87 @@ +"""Utilities for parsing Apple Watch XML health data exports.""" + +from pathlib import Path + +import pandas as pd +import xmltodict +from loguru import logger + + +def load_xml_export(xml_path: Path) -> dict: + """Load an Apple Watch health export XML file and return the parsed dict. 
+ + Args: + xml_path: Path to the Apple Watch ``export.xml`` file. + + Returns: + A nested dictionary produced by ``xmltodict.parse``. + """ + xml_path = Path(xml_path) + logger.info(f"Loading XML export from {xml_path}") + with open(xml_path, "r", encoding="utf-8") as fh: + return xmltodict.parse(fh.read()) + + +def _ensure_list(value: dict | list) -> list: + """Ensure *value* is a list, wrapping a single dict in one if necessary. + + ``xmltodict`` returns a ``dict`` (rather than a one-element ``list``) when + there is only a single child element in the XML. This helper normalises + both cases so callers can always iterate over a list. + """ + return value if isinstance(value, list) else [value] + + +def extract_records(health_data: dict) -> pd.DataFrame: + """Extract health records from the parsed XML dictionary. + + Records contain time-series measurements such as heart rate, step count, + active energy burned, and many other health metrics. + + Args: + health_data: Parsed XML dictionary (returned by :func:`load_xml_export`). + + Returns: + A :class:`~pandas.DataFrame` with one row per health record. + """ + records_list = _ensure_list(health_data["HealthData"]["Record"]) + df = pd.DataFrame(records_list) + logger.info(f"Extracted {len(df)} health records") + return df + + +def extract_workouts(health_data: dict) -> pd.DataFrame: + """Extract workout data from the parsed XML and flatten nested structures. + + Apple Watch workout entries can contain nested metadata which is flattened + using :func:`pandas.json_normalize`. + + Args: + health_data: Parsed XML dictionary (returned by :func:`load_xml_export`). + + Returns: + A flat :class:`~pandas.DataFrame` with one row per workout. 
+ """ + workouts_list = _ensure_list(health_data["HealthData"]["Workout"]) + workout_df = pd.DataFrame(workouts_list) + df = pd.json_normalize(workout_df.to_dict(orient="records")) + logger.info(f"Extracted {len(df)} workouts") + return df + + +def extract_activity_summaries(health_data: dict) -> pd.DataFrame: + """Extract daily activity summaries (rings data) from the parsed XML. + + Activity summaries capture the three Apple Watch activity rings: active + energy burned, exercise minutes, and stand hours – per calendar day. + + Args: + health_data: Parsed XML dictionary (returned by :func:`load_xml_export`). + + Returns: + A :class:`~pandas.DataFrame` with one row per day. + """ + activity_list = _ensure_list(health_data["HealthData"]["ActivitySummary"]) + df = pd.DataFrame(activity_list) + logger.info(f"Extracted {len(df)} activity summaries") + return df diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data_cleaner.py b/tests/test_data_cleaner.py new file mode 100644 index 0000000..ec67518 --- /dev/null +++ b/tests/test_data_cleaner.py @@ -0,0 +1,208 @@ +"""Tests for the data cleaning and aggregation utilities.""" + +import pandas as pd +import pytest + +from awai.utils.data_cleaner import ( + DAILY_SUM_KEYS, + RECORD_TYPES, + aggregate_daily, + aggregate_monthly, + camel_to_snake, + clean_records, + filter_record_types, + rename_columns, +) + +# --------------------------------------------------------------------------- +# camel_to_snake +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "input_name, expected", + [ + ("camelCase", "camel_case"), + ("@type", "type"), + ("startDate", "start_date"), + ("VO2Max", "v_o2_max"), + ("snake_case", "snake_case"), + ("ActiveEnergyBurned", "active_energy_burned"), + ], +) +def test_camel_to_snake(input_name: str, expected: str) -> None: + assert camel_to_snake(input_name) == expected + + +# 
--------------------------------------------------------------------------- +# rename_columns +# --------------------------------------------------------------------------- + + +def test_rename_columns() -> None: + df = pd.DataFrame(columns=["@type", "startDate", "endDate"]) + renamed = rename_columns(df) + assert list(renamed.columns) == ["type", "start_date", "end_date"] + + +def test_rename_columns_does_not_mutate_original() -> None: + df = pd.DataFrame(columns=["@type", "startDate"]) + rename_columns(df) + assert "@type" in df.columns # original is unchanged + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +def _make_raw_records(n: int = 5) -> pd.DataFrame: + """Return a small synthetic raw-records DataFrame (mimics xmltodict output).""" + rows = [ + { + "@type": "HKQuantityTypeIdentifierActiveEnergyBurned", + "startDate": "2024-01-01 08:00:00 +0000", + "endDate": "2024-01-01 08:30:00 +0000", + "creationDate": "2024-01-01 08:30:00 +0000", + "value": str(i * 50), + "unit": "kcal", + "sourceName": "Apple Watch", + } + for i in range(1, n + 1) + ] + return pd.DataFrame(rows) + + +def _make_typed_records() -> dict[str, pd.DataFrame]: + """Return a minimal per-type dict suitable for aggregation tests.""" + df = pd.DataFrame( + { + "ActiveEnergyBurned": [100.0, 200.0, 150.0], + "Date": ["2024-01-01", "2024-01-01", "2024-01-02"], + "Day": ["Monday", "Monday", "Tuesday"], + "Month": ["January", "January", "January"], + } + ) + return {"ActiveEnergyBurned": df} + + +# --------------------------------------------------------------------------- +# clean_records +# --------------------------------------------------------------------------- + + +def test_clean_records_expected_columns() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + for col in ("type", "Date", "Day", "Month", "value", "unit", "duration"): + assert col in 
cleaned.columns, f"Expected column '{col}' missing" + + +def test_clean_records_type_prefix_stripped() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + assert not cleaned["type"].str.startswith("HKQuantityTypeIdentifier").any() + assert not cleaned["type"].str.startswith("HKCategoryTypeIdentifier").any() + + +def test_clean_records_value_is_float() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + assert pd.api.types.is_float_dtype(cleaned["value"]) + + +def test_clean_records_duration_is_timedelta() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + assert pd.api.types.is_timedelta64_dtype(cleaned["duration"]) + + +def test_clean_records_metadata_dropped() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + for col in ("source_name", "creation_date", "end_date"): + assert col not in cleaned.columns + + +def test_clean_records_non_numeric_value_becomes_1() -> None: + raw = _make_raw_records(1) + raw.at[0, "value"] = "not-a-number" + cleaned = clean_records(raw) + assert cleaned["value"].iloc[0] == pytest.approx(1.0) + + +# --------------------------------------------------------------------------- +# filter_record_types +# --------------------------------------------------------------------------- + + +def test_filter_record_types_returns_dict() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + result = filter_record_types(cleaned, ["ActiveEnergyBurned"]) + assert isinstance(result, dict) + assert "ActiveEnergyBurned" in result + + +def test_filter_record_types_renames_value_column() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + result = filter_record_types(cleaned, ["ActiveEnergyBurned"]) + assert "ActiveEnergyBurned" in result["ActiveEnergyBurned"].columns + assert "value" not in result["ActiveEnergyBurned"].columns + + +def test_filter_record_types_all_defaults_present() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + 
result = filter_record_types(cleaned) + assert set(result.keys()) == set(RECORD_TYPES) + + +# --------------------------------------------------------------------------- +# aggregate_daily +# --------------------------------------------------------------------------- + + +def test_aggregate_daily_sums_same_date() -> None: + by_type = _make_typed_records() + daily = aggregate_daily(by_type, keys=["ActiveEnergyBurned"]) + result = daily["ActiveEnergyBurned"] + row_val = result.loc[result["Date"] == "2024-01-01", "ActiveEnergyBurned"] + assert row_val.values[0] == pytest.approx(300.0) + + +def test_aggregate_daily_separate_dates() -> None: + by_type = _make_typed_records() + daily = aggregate_daily(by_type, keys=["ActiveEnergyBurned"]) + result = daily["ActiveEnergyBurned"] + assert len(result) == 2 # two distinct dates + + +def test_aggregate_daily_missing_key_skipped(caplog: pytest.LogCaptureFixture) -> None: + import logging + + by_type: dict = {} + with caplog.at_level(logging.WARNING): + daily = aggregate_daily(by_type, keys=["ActiveEnergyBurned"]) + assert "ActiveEnergyBurned" not in daily + + +# --------------------------------------------------------------------------- +# aggregate_monthly +# --------------------------------------------------------------------------- + + +def test_aggregate_monthly_sums_full_month() -> None: + by_type = _make_typed_records() + monthly = aggregate_monthly(by_type, keys=["ActiveEnergyBurned"]) + result = monthly["ActiveEnergyBurned"] + row_val = result.loc[result["Date"] == "2024-01", "ActiveEnergyBurned"] + assert row_val.values[0] == pytest.approx(450.0) + + +def test_aggregate_monthly_one_row_per_month() -> None: + by_type = _make_typed_records() + monthly = aggregate_monthly(by_type, keys=["ActiveEnergyBurned"]) + result = monthly["ActiveEnergyBurned"] + assert len(result) == 1 diff --git a/tests/test_xml_parser.py b/tests/test_xml_parser.py new file mode 100644 index 0000000..6770f0d --- /dev/null +++ 
b/tests/test_xml_parser.py
@@ -0,0 +1,95 @@
+"""Tests for the XML parsing utilities."""
+
+import textwrap
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from awai.utils.xml_parser import (
+    extract_activity_summaries,
+    extract_records,
+    extract_workouts,
+    load_xml_export,
+)
+
+# ---------------------------------------------------------------------------
+# Minimal Apple Watch XML fixture
+# ---------------------------------------------------------------------------
+
+MINIMAL_XML = textwrap.dedent("""\
+    <HealthData locale="en_US">
+     <ExportDate value="2024-01-02 10:00:00 +0000"/>
+     <Record type="HKQuantityTypeIdentifierActiveEnergyBurned" sourceName="Apple Watch" unit="kcal" startDate="2024-01-01 08:00:00 +0000" endDate="2024-01-01 08:30:00 +0000" value="250"/>
+     <Workout workoutActivityType="HKWorkoutActivityTypeRunning" duration="30" durationUnit="min" startDate="2024-01-01 08:00:00 +0000" endDate="2024-01-01 08:30:00 +0000"/>
+     <ActivitySummary dateComponents="2024-01-01" activeEnergyBurned="500" appleExerciseTime="30" appleStandHours="10"/>
+    </HealthData>
+""")
+
+
+@pytest.fixture()
+def xml_file(tmp_path: Path) -> Path:
+    """Write the minimal XML snippet to a temporary file."""
+    p = tmp_path / "export.xml"
+    p.write_text(MINIMAL_XML, encoding="utf-8")
+    return p
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_load_xml_export_returns_dict(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    assert isinstance(data, dict)
+    assert "HealthData" in data
+
+
+def test_extract_records_returns_dataframe(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_records(data)
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 1
+    assert "@type" in df.columns
+
+
+def test_extract_records_value(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_records(data)
+    assert df["@value"].iloc[0] == "250"
+
+
+def test_extract_workouts_returns_dataframe(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_workouts(data)
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 1
+
+
+def test_extract_activity_summaries_returns_dataframe(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_activity_summaries(data)
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 1
+
+
+def test_extract_activity_summaries_columns(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_activity_summaries(data)
+    assert "@dateComponents" in df.columns