2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,6 +1,6 @@
default_language_version:
python: python3
files: '^src/.*\.pyi$|^tests/.\.pyi?$'
files: '^src/.*\.pyi?$|^tests/.*\.pyi?$'
fail_fast: true
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
93 changes: 78 additions & 15 deletions README.md
@@ -1,33 +1,96 @@
# Apple Watch Fitness AI
A small project to create a fitness AI for the Apple Watch.

The goal is to see if I can predict which time of the day is best to do a workout based on the user's heart rate, sleep, and activity data. Later on, I will try to predict the type of workout that the user should do based on the same data.
A data-science project that predicts optimal workout timing from Apple Watch health data.

Also, I will try to predict the best day of the week to do a workout based on the same data.
## Goals

More to come...
- **Phase 1** – Predict the best *time of day* for a workout (heart rate, sleep, activity data).
- **Phase 2** – Predict the best *type* of workout.
- **Phase 3** – Predict the best *day of the week* for a workout.

## Setup Instructions
## Project structure

To get started with this project, you can use the `init_setup.sh` script to install all necessary dependencies. This script will:
```
AppleWatchAI/
├── src/awai/
│ ├── entrypoint.py # CLI entry point (fire-based)
│ ├── utils/
│ │ ├── xml_parser.py # Parse Apple Watch XML exports
│ │ └── data_cleaner.py # Cleaning, filtering & aggregation helpers
│ ├── models/
│ │ └── schemas.py # Pandera validation schemas
│ └── tasks/
│ ├── load_data.py # XML → DuckDB load task
│ └── prepare_data.py # Full clean/filter/aggregate pipeline
├── notebooks/exploratory/ # Jupytext-managed EDA notebooks
├── tests/ # pytest test suite
├── data/ # Apple Watch export files (git-ignored)
└── settings.toml # Default configuration (Dynaconf)
```

## Setup

### Prerequisites

- Check if Homebrew is installed and install it if necessary.
- Install Poetry using Homebrew.
- Install npm using Homebrew.
- Install nodemon globally using npm.
- Python 3.11 or later
- [Poetry](https://python-poetry.org/) (install via `pip install poetry` or `brew install poetry`)

To run the setup script, use the following command in your terminal:
### macOS quick-start

```sh
./init_setup.sh
./init_setup.sh # installs Homebrew & Poetry if missing
poetry install # installs all Python dependencies
```

After running the setup script, you need to install the project dependencies using Poetry. Run the following command in your terminal:
### Other platforms

```sh
pip install poetry
poetry install
```

This will install all the dependencies specified in the pyproject.toml file.
## Usage

Export your Apple Health data from the iPhone Health app (*Profile → Export All Health Data*) and place the resulting `export.xml` inside a `data/` folder at the project root.

### Load the XML export into DuckDB

```sh
poetry run awai load --xml_path=data/export.xml --db_path=data/health_data.duckdb
```

### Run the data-preparation pipeline

```sh
poetry run awai prepare --db_path=data/health_data.duckdb
```

### Explore interactively

The `notebooks/exploratory/EDA.py` notebook contains the original exploratory analysis.
To convert it to a Jupyter notebook and open it:

```sh
poetry run jupytext --sync notebooks/exploratory/EDA.py
poetry run jupyter lab notebooks/exploratory/EDA.ipynb
```

## Running tests

```sh
poetry run pytest
```

## Configuration

Default settings live in `settings.toml`. Override any value with an environment variable prefixed with `DYNACONF_`, e.g.:

```sh
export DYNACONF_DATA_DIR=/path/to/my/data
```

## Contributing

You are now ready to start working on the project! 🚀
1. Install dev dependencies: `poetry install`
2. Install pre-commit hooks: `poetry run pre-commit install`
3. Run the test suite: `poetry run pytest`
16 changes: 0 additions & 16 deletions init_setup.sh
@@ -21,20 +21,4 @@ else
    echo "Poetry is already installed."
fi

# Install npm using Homebrew
if ! command_exists npm; then
    echo "Installing npm..."
    brew install npm
else
    echo "npm is already installed."
fi

# Install nodemon globally using npm
if ! command_exists nodemon; then
    echo "Installing nodemon globally..."
    npm install -g nodemon
else
    echo "nodemon is already installed."
fi

echo "Setup complete."
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -34,6 +34,12 @@ formats = "ipynb,py:percent"
[tool.ruff.lint]
extend-select = ["I"]

[tool.poetry.scripts]
awai = "awai.entrypoint:main"

[tool.pytest.ini_options]
testpaths = ["tests"]

[build-system]
requires = ["poetry-core==1.8.2"]
requires = ["poetry-core>=1.9.0"]
build-backend = "poetry.core.masonry.api"
5 changes: 5 additions & 0 deletions settings.toml
@@ -0,0 +1,5 @@
[default]
data_dir = "data"
xml_file_name = "export.xml"
db_name = "health_data.duckdb"
log_level = "INFO"
1 change: 1 addition & 0 deletions src/awai/__init__.py
@@ -0,0 +1 @@
"""Apple Watch AI – workout timing prediction from Apple Health data."""
58 changes: 58 additions & 0 deletions src/awai/entrypoint.py
@@ -0,0 +1,58 @@
"""CLI entry point for Apple Watch AI."""

from pathlib import Path

import fire
from loguru import logger

from awai.tasks.load_data import load_to_duckdb
from awai.tasks.prepare_data import prepare_records


class CLI:
    """Apple Watch AI command-line interface.

    Available commands::

        awai load – parse export.xml and store data in DuckDB
        awai prepare – clean, filter, and aggregate the stored data
    """

    def load(
        self,
        xml_path: str = "data/export.xml",
        db_path: str = "data/health_data.duckdb",
    ) -> None:
        """Parse an Apple Watch XML export and load it into a DuckDB database.

        Args:
            xml_path: Path to the Apple Watch ``export.xml`` file.
            db_path: Path where the DuckDB database will be created.
        """
        load_to_duckdb(Path(xml_path), Path(db_path))
        logger.info("Load complete.")

    def prepare(
        self,
        db_path: str = "data/health_data.duckdb",
    ) -> None:
        """Run the data-preparation pipeline (clean, filter, aggregate).

        Args:
            db_path: Path to the DuckDB database created by the ``load`` command.
        """
        by_type, daily, monthly = prepare_records(Path(db_path))
        logger.info(
            f"Prepared {len(by_type)} record types, "
            f"{sum(len(v) for v in daily.values()):,} daily rows, "
            f"{sum(len(v) for v in monthly.values()):,} monthly rows."
        )


def main() -> None:
    """Fire-based CLI entry point."""
    fire.Fire(CLI)


if __name__ == "__main__":
    main()
5 changes: 5 additions & 0 deletions src/awai/models/__init__.py
@@ -0,0 +1,5 @@
"""Data models and validation schemas."""

from awai.models.schemas import RecordsSchema

__all__ = ["RecordsSchema"]
22 changes: 22 additions & 0 deletions src/awai/models/schemas.py
@@ -0,0 +1,22 @@
"""Pandera schemas for validating Apple Watch health data DataFrames."""

import pandera as pa
from pandera import Column, DataFrameSchema

#: Schema for the cleaned health records DataFrame produced by
#: :func:`~awai.utils.data_cleaner.clean_records`.
RecordsSchema = DataFrameSchema(
    {
        "type": Column(str, nullable=False),
        "Date": Column(
            str,
            pa.Check.str_matches(r"^\d{4}-\d{2}-\d{2}$"),
            nullable=False,
        ),
        "Day": Column(str, nullable=False),
        "Month": Column(str, nullable=False),
        "value": Column(float, pa.Check.ge(0), nullable=False),
        "unit": Column(str, nullable=True),
    },
    coerce=True,
)
6 changes: 6 additions & 0 deletions src/awai/tasks/__init__.py
@@ -0,0 +1,6 @@
"""Task modules for data loading and preparation."""

from awai.tasks.load_data import load_to_duckdb
from awai.tasks.prepare_data import prepare_records

__all__ = ["load_to_duckdb", "prepare_records"]
62 changes: 62 additions & 0 deletions src/awai/tasks/load_data.py
@@ -0,0 +1,62 @@
"""Task: load an Apple Watch XML export into a DuckDB database."""

from pathlib import Path

import duckdb
from loguru import logger

from awai.utils.xml_parser import (
    extract_activity_summaries,
    extract_records,
    extract_workouts,
    load_xml_export,
)


def load_to_duckdb(xml_path: Path, db_path: Path) -> None:
    """Parse the Apple Watch XML export and persist data into DuckDB.

    Three tables are created (if they do not already exist):

    * ``records`` – time-series health measurements.
    * ``workouts`` – individual workout sessions (flattened).
    * ``activities`` – daily activity-ring summaries.

    If all three tables are already present in *db_path* the function exits
    early without re-parsing the XML file.

    Args:
        xml_path: Path to the Apple Watch ``export.xml`` file.
        db_path: Path where the DuckDB database will be created or opened.
    """
    xml_path = Path(xml_path)
    db_path = Path(db_path)

    con = duckdb.connect(str(db_path))
    try:
        already_loaded = (
            con.execute(
                "SELECT COUNT(*) FROM information_schema.tables "
                "WHERE table_schema = 'main' "
                "AND table_name IN ('records', 'workouts', 'activities')"
            ).fetchone()[0]
            >= 3
        )
        if already_loaded:
            logger.info("All tables already exist in DuckDB – skipping load.")
            return

        logger.info(f"Parsing XML export from {xml_path} …")
        health_data = load_xml_export(xml_path)

        records_df = extract_records(health_data)
        workout_df_flat = extract_workouts(health_data)
        activity_df = extract_activity_summaries(health_data)

        con.execute("CREATE TABLE records AS SELECT * FROM records_df")
        con.execute("CREATE TABLE workouts AS SELECT * FROM workout_df_flat")
        con.execute("CREATE TABLE activities AS SELECT * FROM activity_df")

        logger.info("Data successfully loaded into DuckDB.")
    finally:
        con.close()
62 changes: 62 additions & 0 deletions src/awai/tasks/prepare_data.py
@@ -0,0 +1,62 @@
"""Task: prepare cleaned health data for downstream analysis and ML."""

from pathlib import Path

import duckdb
import pandas as pd
from loguru import logger

from awai.utils.data_cleaner import (
    aggregate_daily,
    aggregate_monthly,
    clean_records,
    filter_record_types,
)


def load_records_from_db(db_path: Path) -> pd.DataFrame:
    """Read the raw ``records`` table from a DuckDB database.

    Args:
        db_path: Path to the DuckDB database (created by :mod:`~awai.tasks.load_data`).

    Returns:
        Raw records :class:`~pandas.DataFrame` as stored in the database.
    """
    db_path = Path(db_path)
    con = duckdb.connect(str(db_path), read_only=True)
    try:
        df = con.query("SELECT * FROM records").to_df()
    finally:
        con.close()
    logger.info(f"Loaded {len(df):,} records from {db_path}")
    return df


def prepare_records(
    db_path: Path,
) -> tuple[dict[str, pd.DataFrame], dict[str, pd.DataFrame], dict[str, pd.DataFrame]]:
    """Run the full data-preparation pipeline for health records.

    Pipeline steps:

    1. Load raw records from DuckDB.
    2. Clean and normalise the DataFrame (:func:`~awai.utils.data_cleaner.clean_records`).
    3. Split into per-type DataFrames (:func:`~awai.utils.data_cleaner.filter_record_types`).
    4. Compute daily aggregations (:func:`~awai.utils.data_cleaner.aggregate_daily`).
    5. Compute monthly aggregations (:func:`~awai.utils.data_cleaner.aggregate_monthly`).

    Args:
        db_path: Path to the DuckDB database.

    Returns:
        A 3-tuple ``(records_by_type, daily, monthly)`` where each element is
        a ``dict`` mapping record-type names to :class:`~pandas.DataFrame` objects.
    """
    raw_df = load_records_from_db(db_path)
    cleaned = clean_records(raw_df)
    by_type = filter_record_types(cleaned)
    daily = aggregate_daily(by_type)
    monthly = aggregate_monthly(by_type)
    logger.info("Data preparation complete.")
    return by_type, daily, monthly
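To make the return shape of `prepare_records` concrete, here is a toy pandas sketch (the column names, sample rows, and mean aggregation are illustrative assumptions; the real logic lives in `awai.utils.data_cleaner`): records are split into per-type DataFrames, then each is aggregated per day.

```python
import pandas as pd

# Invented toy records; the real columns come from clean_records.
records = pd.DataFrame(
    {
        "type": ["HeartRate", "HeartRate", "StepCount"],
        "Date": ["2024-01-01", "2024-01-01", "2024-01-01"],
        "value": [60.0, 80.0, 1000.0],
    }
)

# Split into per-type frames, then aggregate each by day (mean of values),
# mirroring the (records_by_type, daily, ...) shape prepare_records returns.
by_type = {t: g for t, g in records.groupby("type")}
daily = {
    t: g.groupby("Date", as_index=False)["value"].mean()
    for t, g in by_type.items()
}
```

Downstream modelling can then look up e.g. `daily["HeartRate"]` without re-filtering the full table.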