From c91783707d611c6c86bc6f4ae6a00a011b0658ec Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:09:21 +0000 Subject: [PATCH 1/2] Initial plan From fa6bee382e449437487cbe03c5d2114d11453a88 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:24:40 +0000 Subject: [PATCH 2/2] Implement all empty modules, add tests, fix config files Co-authored-by: ansuff <63700848+ansuff@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- README.md | 93 ++++++++++-- init_setup.sh | 16 --- pyproject.toml | 8 +- settings.toml | 5 + src/awai/__init__.py | 1 + src/awai/entrypoint.py | 58 ++++++++ src/awai/models/__init__.py | 5 + src/awai/models/schemas.py | 22 +++ src/awai/tasks/__init__.py | 6 + src/awai/tasks/load_data.py | 62 ++++++++ src/awai/tasks/prepare_data.py | 62 ++++++++ src/awai/utils/__init__.py | 27 ++++ src/awai/utils/data_cleaner.py | 251 +++++++++++++++++++++++++++++++++ src/awai/utils/xml_parser.py | 87 ++++++++++++ tests/__init__.py | 0 tests/test_data_cleaner.py | 208 +++++++++++++++++++++++++++ tests/test_xml_parser.py | 95 +++++++++++++ 18 files changed, 975 insertions(+), 33 deletions(-) create mode 100644 src/awai/models/schemas.py create mode 100644 src/awai/tasks/load_data.py create mode 100644 src/awai/tasks/prepare_data.py create mode 100644 src/awai/utils/data_cleaner.py create mode 100644 src/awai/utils/xml_parser.py create mode 100644 tests/__init__.py create mode 100644 tests/test_data_cleaner.py create mode 100644 tests/test_xml_parser.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 96a6bec..22410fb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ default_language_version: python: python3 -files: '^src/.*\.pyi$|^tests/.\.pyi?$' +files: '^src/.*\.pyi?$|^tests/.*\.pyi?$' fail_fast: true repos: - repo: 
https://github.com/pre-commit/pre-commit-hooks diff --git a/README.md b/README.md index e494c32..89abc80 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,96 @@ # Apple Watch Fitness AI -A small project to create a fitness AI for the Apple Watch. -The goal is to see if I can predict which time of the day is best to do a workout based on the user's heart rate, sleep, and activity data. Later on, I will try to predict the type of workout that the user should do based on the same data. +A data-science project that predicts optimal workout timing from Apple Watch health data. -Also, I will try to predict the best day of the week to do a workout based on the same data. +## Goals -More to come... +- **Phase 1** – Predict the best *time of day* for a workout (heart rate, sleep, activity data). +- **Phase 2** – Predict the best *type* of workout. +- **Phase 3** – Predict the best *day of the week* for a workout. -## Setup Instructions +## Project structure -To get started with this project, you can use the `init_setup.sh` script to install all necessary dependencies. This script will: +``` +AppleWatchAI/ +├── src/awai/ +│ ├── entrypoint.py # CLI entry point (fire-based) +│ ├── utils/ +│ │ ├── xml_parser.py # Parse Apple Watch XML exports +│ │ └── data_cleaner.py # Cleaning, filtering & aggregation helpers +│ ├── models/ +│ │ └── schemas.py # Pandera validation schemas +│ └── tasks/ +│ ├── load_data.py # XML → DuckDB load task +│ └── prepare_data.py # Full clean/filter/aggregate pipeline +├── notebooks/exploratory/ # Jupytext-managed EDA notebooks +├── tests/ # pytest test suite +├── data/ # Apple Watch export files (git-ignored) +└── settings.toml # Default configuration (Dynaconf) +``` + +## Setup + +### Prerequisites -- Check if Homebrew is installed and install it if necessary. -- Install Poetry using Homebrew. -- Install npm using Homebrew. -- Install nodemon globally using npm. 
+- Python 3.11 or later +- [Poetry](https://python-poetry.org/) (install via `pip install poetry` or `brew install poetry`) -To run the setup script, use the following command in your terminal: +### macOS quick-start ```sh -./init_setup.sh +./init_setup.sh # installs Homebrew & Poetry if missing +poetry install # installs all Python dependencies ``` -After running the setup script, you need to install the project dependencies using Poetry. Run the following command in your terminal: +### Other platforms ```sh +pip install poetry poetry install ``` -This will install all the dependencies specified in the pyproject.toml file. +## Usage + +Export your Apple Health data from the iPhone Health app (*Profile → Export All Health Data*) and place the resulting `export.xml` inside a `data/` folder at the project root. + +### Load the XML export into DuckDB + +```sh +poetry run awai load --xml_path=data/export.xml --db_path=data/health_data.duckdb +``` + +### Run the data-preparation pipeline + +```sh +poetry run awai prepare --db_path=data/health_data.duckdb +``` + +### Explore interactively + +The `notebooks/exploratory/EDA.py` notebook contains the original exploratory analysis. +To convert it to a Jupyter notebook and open it: + +```sh +poetry run jupytext --sync notebooks/exploratory/EDA.py +poetry run jupyter lab notebooks/exploratory/EDA.ipynb +``` + +## Running tests + +```sh +poetry run pytest +``` + +## Configuration + +Default settings live in `settings.toml`. Override any value with an environment variable prefixed with `DYNACONF_`, e.g.: + +```sh +export DYNACONF_DATA_DIR=/path/to/my/data +``` + +## Contributing -You are now ready to start working on the project! 🚀 \ No newline at end of file +1. Install dev dependencies: `poetry install` +2. Install pre-commit hooks: `poetry run pre-commit install` +3. 
Run the test suite: `poetry run pytest` diff --git a/init_setup.sh b/init_setup.sh index f968e2a..ee3c78b 100644 --- a/init_setup.sh +++ b/init_setup.sh @@ -21,20 +21,4 @@ else echo "Poetry is already installed." fi -# Install npm using Homebrew -if ! command_exists npm; then - echo "Installing npm..." - brew install npm -else - echo "npm is already installed." -fi - -# Install nodemon globally using npm -if ! command_exists nodemon; then - echo "Installing nodemon globally..." - npm install -g nodemon -else - echo "nodemon is already installed." -fi - echo "Setup complete." \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c7e0a7c..2ddac4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,12 @@ formats = "ipynb,py:percent" [tool.ruff.lint] extend-select = ["I"] +[tool.poetry.scripts] +awai = "awai.entrypoint:main" + +[tool.pytest.ini_options] +testpaths = ["tests"] + [build-system] -requires = ["poetry-core==1.8.2"] +requires = ["poetry-core>=1.9.0"] build-backend = "poetry.core.masonry.api" diff --git a/settings.toml b/settings.toml index e69de29..532124f 100644 --- a/settings.toml +++ b/settings.toml @@ -0,0 +1,5 @@ +[default] +data_dir = "data" +xml_file_name = "export.xml" +db_name = "health_data.duckdb" +log_level = "INFO" diff --git a/src/awai/__init__.py b/src/awai/__init__.py index e69de29..a430591 100644 --- a/src/awai/__init__.py +++ b/src/awai/__init__.py @@ -0,0 +1 @@ +"""Apple Watch AI – workout timing prediction from Apple Health data.""" diff --git a/src/awai/entrypoint.py b/src/awai/entrypoint.py index e69de29..29b4362 100644 --- a/src/awai/entrypoint.py +++ b/src/awai/entrypoint.py @@ -0,0 +1,58 @@ +"""CLI entry point for Apple Watch AI.""" + +from pathlib import Path + +import fire +from loguru import logger + +from awai.tasks.load_data import load_to_duckdb +from awai.tasks.prepare_data import prepare_records + + +class CLI: + """Apple Watch AI command-line interface. 
+ + Available commands:: + + awai load – parse export.xml and store data in DuckDB + awai prepare – clean, filter, and aggregate the stored data + """ + + def load( + self, + xml_path: str = "data/export.xml", + db_path: str = "data/health_data.duckdb", + ) -> None: + """Parse an Apple Watch XML export and load it into a DuckDB database. + + Args: + xml_path: Path to the Apple Watch ``export.xml`` file. + db_path: Path where the DuckDB database will be created. + """ + load_to_duckdb(Path(xml_path), Path(db_path)) + logger.info("Load complete.") + + def prepare( + self, + db_path: str = "data/health_data.duckdb", + ) -> None: + """Run the data-preparation pipeline (clean, filter, aggregate). + + Args: + db_path: Path to the DuckDB database created by the ``load`` command. + """ + by_type, daily, monthly = prepare_records(Path(db_path)) + logger.info( + f"Prepared {len(by_type)} record types, " + f"{sum(len(v) for v in daily.values()):,} daily rows, " + f"{sum(len(v) for v in monthly.values()):,} monthly rows." + ) + + +def main() -> None: + """Fire-based CLI entry point.""" + fire.Fire(CLI) + + +if __name__ == "__main__": + main() diff --git a/src/awai/models/__init__.py b/src/awai/models/__init__.py index e69de29..39b5cf2 100644 --- a/src/awai/models/__init__.py +++ b/src/awai/models/__init__.py @@ -0,0 +1,5 @@ +"""Data models and validation schemas.""" + +from awai.models.schemas import RecordsSchema + +__all__ = ["RecordsSchema"] diff --git a/src/awai/models/schemas.py b/src/awai/models/schemas.py new file mode 100644 index 0000000..8d7c6a4 --- /dev/null +++ b/src/awai/models/schemas.py @@ -0,0 +1,22 @@ +"""Pandera schemas for validating Apple Watch health data DataFrames.""" + +import pandera as pa +from pandera import Column, DataFrameSchema + +#: Schema for the cleaned health records DataFrame produced by +#: :func:`~awai.utils.data_cleaner.clean_records`. 
+RecordsSchema = DataFrameSchema( + { + "type": Column(str, nullable=False), + "Date": Column( + str, + pa.Check.str_matches(r"^\d{4}-\d{2}-\d{2}$"), + nullable=False, + ), + "Day": Column(str, nullable=False), + "Month": Column(str, nullable=False), + "value": Column(float, pa.Check.ge(0), nullable=False), + "unit": Column(str, nullable=True), + }, + coerce=True, +) diff --git a/src/awai/tasks/__init__.py b/src/awai/tasks/__init__.py index e69de29..a02d07b 100644 --- a/src/awai/tasks/__init__.py +++ b/src/awai/tasks/__init__.py @@ -0,0 +1,6 @@ +"""Task modules for data loading and preparation.""" + +from awai.tasks.load_data import load_to_duckdb +from awai.tasks.prepare_data import prepare_records + +__all__ = ["load_to_duckdb", "prepare_records"] diff --git a/src/awai/tasks/load_data.py b/src/awai/tasks/load_data.py new file mode 100644 index 0000000..b4193ae --- /dev/null +++ b/src/awai/tasks/load_data.py @@ -0,0 +1,62 @@ +"""Task: load an Apple Watch XML export into a DuckDB database.""" + +from pathlib import Path + +import duckdb +from loguru import logger + +from awai.utils.xml_parser import ( + extract_activity_summaries, + extract_records, + extract_workouts, + load_xml_export, +) + + +def load_to_duckdb(xml_path: Path, db_path: Path) -> None: + """Parse the Apple Watch XML export and persist data into DuckDB. + + Three tables are created (if they do not already exist): + + * ``records`` – time-series health measurements. + * ``workouts`` – individual workout sessions (flattened). + * ``activities`` – daily activity-ring summaries. + + If all three tables are already present in *db_path* the function exits + early without re-parsing the XML file. + + Args: + xml_path: Path to the Apple Watch ``export.xml`` file. + db_path: Path where the DuckDB database will be created or opened. 
+ """ + xml_path = Path(xml_path) + db_path = Path(db_path) + + con = duckdb.connect(str(db_path)) + try: + already_loaded = ( + con.execute( + "SELECT COUNT(*) FROM information_schema.tables " + "WHERE table_schema = 'main' " + "AND table_name IN ('records', 'workouts', 'activities')" + ).fetchone()[0] + >= 3 + ) + if already_loaded: + logger.info("All tables already exist in DuckDB – skipping load.") + return + + logger.info(f"Parsing XML export from {xml_path} …") + health_data = load_xml_export(xml_path) + + records_df = extract_records(health_data) + workout_df_flat = extract_workouts(health_data) + activity_df = extract_activity_summaries(health_data) + + con.execute("CREATE TABLE records AS SELECT * FROM records_df") + con.execute("CREATE TABLE workouts AS SELECT * FROM workout_df_flat") + con.execute("CREATE TABLE activities AS SELECT * FROM activity_df") + + logger.info("Data successfully loaded into DuckDB.") + finally: + con.close() diff --git a/src/awai/tasks/prepare_data.py b/src/awai/tasks/prepare_data.py new file mode 100644 index 0000000..6263fb7 --- /dev/null +++ b/src/awai/tasks/prepare_data.py @@ -0,0 +1,62 @@ +"""Task: prepare cleaned health data for downstream analysis and ML.""" + +from pathlib import Path + +import duckdb +import pandas as pd +from loguru import logger + +from awai.utils.data_cleaner import ( + aggregate_daily, + aggregate_monthly, + clean_records, + filter_record_types, +) + + +def load_records_from_db(db_path: Path) -> pd.DataFrame: + """Read the raw ``records`` table from a DuckDB database. + + Args: + db_path: Path to the DuckDB database (created by :mod:`~awai.tasks.load_data`). + + Returns: + Raw records :class:`~pandas.DataFrame` as stored in the database. 
+ """ + db_path = Path(db_path) + con = duckdb.connect(str(db_path), read_only=True) + try: + df = con.query("SELECT * FROM records").to_df() + finally: + con.close() + logger.info(f"Loaded {len(df):,} records from {db_path}") + return df + + +def prepare_records( + db_path: Path, +) -> tuple[dict[str, pd.DataFrame], dict[str, pd.DataFrame], dict[str, pd.DataFrame]]: + """Run the full data-preparation pipeline for health records. + + Pipeline steps: + + 1. Load raw records from DuckDB. + 2. Clean and normalise the DataFrame (:func:`~awai.utils.data_cleaner.clean_records`). + 3. Split into per-type DataFrames (:func:`~awai.utils.data_cleaner.filter_record_types`). + 4. Compute daily aggregations (:func:`~awai.utils.data_cleaner.aggregate_daily`). + 5. Compute monthly aggregations (:func:`~awai.utils.data_cleaner.aggregate_monthly`). + + Args: + db_path: Path to the DuckDB database. + + Returns: + A 3-tuple ``(records_by_type, daily, monthly)`` where each element is + a ``dict`` mapping record-type names to :class:`~pandas.DataFrame` objects. 
+ """ + raw_df = load_records_from_db(db_path) + cleaned = clean_records(raw_df) + by_type = filter_record_types(cleaned) + daily = aggregate_daily(by_type) + monthly = aggregate_monthly(by_type) + logger.info("Data preparation complete.") + return by_type, daily, monthly diff --git a/src/awai/utils/__init__.py b/src/awai/utils/__init__.py index e69de29..e009f3a 100644 --- a/src/awai/utils/__init__.py +++ b/src/awai/utils/__init__.py @@ -0,0 +1,27 @@ +"""Utility modules for Apple Watch AI.""" + +from awai.utils.data_cleaner import ( + aggregate_daily, + aggregate_monthly, + camel_to_snake, + clean_records, + filter_record_types, +) +from awai.utils.xml_parser import ( + extract_activity_summaries, + extract_records, + extract_workouts, + load_xml_export, +) + +__all__ = [ + "camel_to_snake", + "clean_records", + "filter_record_types", + "aggregate_daily", + "aggregate_monthly", + "load_xml_export", + "extract_records", + "extract_workouts", + "extract_activity_summaries", +] diff --git a/src/awai/utils/data_cleaner.py b/src/awai/utils/data_cleaner.py new file mode 100644 index 0000000..2913dc8 --- /dev/null +++ b/src/awai/utils/data_cleaner.py @@ -0,0 +1,251 @@ +"""Data cleaning and aggregation utilities for Apple Watch health data.""" + +import re + +import pandas as pd +from loguru import logger + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +#: Health record types that are relevant to workout-timing analysis. 
RECORD_TYPES: list[str] = [
+    "BodyMass",
+    "ActiveEnergyBurned",
+    "BasalEnergyBurned",
+    "DistanceWalkingRunning",
+    "StepCount",
+    "AppleStandTime",
+    "WalkingSpeed",
+    "DistanceCycling",
+    "HeartRateVariabilitySDNN",
+    "RestingHeartRate",
+    "WalkingHeartRateAverage",
+    "VO2Max",
+    "HeartRateRecoveryOneMinute",
+    "PhysicalEffort",
+    "SleepAnalysis",
+]
+
+#: Columns dropped during cleaning (metadata that is not useful for analysis).
+COLUMNS_TO_DROP: list[str] = [
+    "source_name",
+    "source_version",
+    "device",
+    "creation_date",
+    "end_date",
+    "metadata_entry",
+    "heart_rate_variability_metadata_list",
+]
+
+#: Record types whose daily values should be *summed* (rather than averaged).
+DAILY_SUM_KEYS: list[str] = [
+    "BasalEnergyBurned",
+    "ActiveEnergyBurned",
+    "DistanceWalkingRunning",
+    "StepCount",
+    "AppleStandTime",
+    "DistanceCycling",
+    "PhysicalEffort",
+]
+
+# ---------------------------------------------------------------------------
+# Low-level helpers
+# ---------------------------------------------------------------------------
+
+
+def camel_to_snake(name: str) -> str:
+    """Convert a camelCase or ``@camelCase`` column name to ``snake_case``.
+
+    Args:
+        name: The original column name (may start with ``@``).
+
+    Returns:
+        The snake_case equivalent.
+
+    Examples:
+        >>> camel_to_snake("startDate")
+        'start_date'
+        >>> camel_to_snake("@type")
+        'type'
+    """
+    name = re.sub("@", "", name)
+    name = re.sub(r"(?<!^)(?=[A-Z])", "_", name)
+    return name.lower()
+
+
+def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Rename all DataFrame columns from camelCase / ``@camelCase`` to snake_case.
+
+    Args:
+        df: Input DataFrame.
+
+    Returns:
+        A copy of *df* with renamed columns.
+ """ + df = df.copy() + df.columns = [camel_to_snake(col) for col in df.columns] + return df + + +# --------------------------------------------------------------------------- +# Records cleaning +# --------------------------------------------------------------------------- + + +def clean_records(df: pd.DataFrame) -> pd.DataFrame: + """Clean and transform the raw health records DataFrame. + + Steps performed: + + 1. Rename columns to ``snake_case``. + 2. Parse ``start_date`` / ``end_date`` and compute ``duration``. + 3. Drop metadata columns (see :data:`COLUMNS_TO_DROP`). + 4. Add ``Day``, ``Date``, and ``Month`` helper columns from ``start_date``. + 5. Coerce ``value`` to :class:`float` (fill non-numeric rows with ``1.0``). + 6. Strip ``HKQuantityTypeIdentifier`` / ``HKCategoryTypeIdentifier`` prefixes + from the ``type`` column. + + Args: + df: Raw records DataFrame as produced by + :func:`~awai.utils.xml_parser.extract_records`. + + Returns: + Cleaned DataFrame ready for further analysis. + """ + df = rename_columns(df) + + # Parse dates and compute duration before any columns are dropped. + for col in ("start_date", "end_date"): + if col in df.columns: + df[col] = pd.to_datetime(df[col]) + if "start_date" in df.columns and "end_date" in df.columns: + df["duration"] = df["end_date"] - df["start_date"] + + # Drop metadata columns that are present in this export. + existing_drops = [c for c in COLUMNS_TO_DROP if c in df.columns] + df = df.drop(columns=existing_drops) + + # Add calendar helper columns derived from start_date. + if "start_date" in df.columns: + df["Day"] = df["start_date"].dt.strftime("%A") + df["Date"] = df["start_date"].dt.strftime("%Y-%m-%d") + df["Month"] = df["start_date"].dt.strftime("%B") + + # Coerce value to float; records that have no numeric value (e.g. presence + # indicators such as SleepAnalysis) are assigned 1.0 so they count as one + # occurrence and can still be summed/aggregated meaningfully. 
+ if "value" in df.columns: + df["value"] = pd.to_numeric(df["value"], errors="coerce").fillna(1.0).astype(float) + + # Shorten Apple's verbose type identifiers. + if "type" in df.columns: + df["type"] = ( + df["type"] + .str.replace("HKQuantityTypeIdentifier", "", regex=False) + .str.replace("HKCategoryTypeIdentifier", "", regex=False) + ) + + # Reorder to a canonical column order; any extra columns go at the end. + desired = ["type", "Date", "Day", "Month", "value", "unit", "duration"] + available = [c for c in desired if c in df.columns] + remaining = [c for c in df.columns if c not in desired] + df = df[available + remaining] + + logger.info(f"Cleaned records: {len(df):,} rows, {len(df.columns)} columns") + return df + + +# --------------------------------------------------------------------------- +# Filtering & aggregation +# --------------------------------------------------------------------------- + + +def filter_record_types( + df: pd.DataFrame, + record_types: list[str] | None = None, +) -> dict[str, pd.DataFrame]: + """Split a cleaned records DataFrame into one sub-DataFrame per health type. + + Args: + df: Cleaned records DataFrame (output of :func:`clean_records`). + record_types: Types to extract. Defaults to :data:`RECORD_TYPES`. + + Returns: + A ``dict`` mapping each record type name to a filtered and renamed + DataFrame where the ``value`` column is renamed to the type name. + """ + if record_types is None: + record_types = RECORD_TYPES + + result: dict[str, pd.DataFrame] = {} + for rt in record_types: + mask = df["type"].str.contains(rt, regex=False) + subset = df.loc[mask].rename(columns={"value": rt}).sort_values("Date") + result[rt] = subset + + logger.info(f"Filtered into {len(result)} record type groups") + return result + + +def aggregate_daily( + records_by_type: dict[str, pd.DataFrame], + keys: list[str] | None = None, +) -> dict[str, pd.DataFrame]: + """Aggregate per-type DataFrames to *daily* totals (sum). 
+ + Args: + records_by_type: Output of :func:`filter_record_types`. + keys: Types to aggregate. Defaults to :data:`DAILY_SUM_KEYS`. + + Returns: + A ``dict`` mapping each type name to a daily-aggregated DataFrame. + """ + if keys is None: + keys = DAILY_SUM_KEYS + + daily: dict[str, pd.DataFrame] = {} + for key in keys: + if key not in records_by_type: + logger.warning(f"Key '{key}' not found in records – skipping daily aggregation") + continue + df = records_by_type[key] + daily[key] = ( + df.groupby("Date") + .agg({key: "sum", "Day": lambda x: x.mode().iloc[0] if not x.mode().empty else x.iloc[0]}) + .reset_index() + ) + + return daily + + +def aggregate_monthly( + records_by_type: dict[str, pd.DataFrame], + keys: list[str] | None = None, +) -> dict[str, pd.DataFrame]: + """Aggregate per-type DataFrames to *monthly* totals (sum). + + Args: + records_by_type: Output of :func:`filter_record_types`. + keys: Types to aggregate. Defaults to :data:`DAILY_SUM_KEYS`. + + Returns: + A ``dict`` mapping each type name to a monthly-aggregated DataFrame. + """ + if keys is None: + keys = DAILY_SUM_KEYS + + monthly: dict[str, pd.DataFrame] = {} + for key in keys: + if key not in records_by_type: + logger.warning(f"Key '{key}' not found in records – skipping monthly aggregation") + continue + df = records_by_type[key] + monthly[key] = ( + df.groupby(df["Date"].str[:-3]) + .agg({key: "sum", "Month": lambda x: x.mode().iloc[0] if not x.mode().empty else x.iloc[0]}) + .reset_index() + ) + + return monthly diff --git a/src/awai/utils/xml_parser.py b/src/awai/utils/xml_parser.py new file mode 100644 index 0000000..c35af8d --- /dev/null +++ b/src/awai/utils/xml_parser.py @@ -0,0 +1,87 @@ +"""Utilities for parsing Apple Watch XML health data exports.""" + +from pathlib import Path + +import pandas as pd +import xmltodict +from loguru import logger + + +def load_xml_export(xml_path: Path) -> dict: + """Load an Apple Watch health export XML file and return the parsed dict. 
+ + Args: + xml_path: Path to the Apple Watch ``export.xml`` file. + + Returns: + A nested dictionary produced by ``xmltodict.parse``. + """ + xml_path = Path(xml_path) + logger.info(f"Loading XML export from {xml_path}") + with open(xml_path, "r", encoding="utf-8") as fh: + return xmltodict.parse(fh.read()) + + +def _ensure_list(value: dict | list) -> list: + """Ensure *value* is a list, wrapping a single dict in one if necessary. + + ``xmltodict`` returns a ``dict`` (rather than a one-element ``list``) when + there is only a single child element in the XML. This helper normalises + both cases so callers can always iterate over a list. + """ + return value if isinstance(value, list) else [value] + + +def extract_records(health_data: dict) -> pd.DataFrame: + """Extract health records from the parsed XML dictionary. + + Records contain time-series measurements such as heart rate, step count, + active energy burned, and many other health metrics. + + Args: + health_data: Parsed XML dictionary (returned by :func:`load_xml_export`). + + Returns: + A :class:`~pandas.DataFrame` with one row per health record. + """ + records_list = _ensure_list(health_data["HealthData"]["Record"]) + df = pd.DataFrame(records_list) + logger.info(f"Extracted {len(df)} health records") + return df + + +def extract_workouts(health_data: dict) -> pd.DataFrame: + """Extract workout data from the parsed XML and flatten nested structures. + + Apple Watch workout entries can contain nested metadata which is flattened + using :func:`pandas.json_normalize`. + + Args: + health_data: Parsed XML dictionary (returned by :func:`load_xml_export`). + + Returns: + A flat :class:`~pandas.DataFrame` with one row per workout. 
+ """ + workouts_list = _ensure_list(health_data["HealthData"]["Workout"]) + workout_df = pd.DataFrame(workouts_list) + df = pd.json_normalize(workout_df.to_dict(orient="records")) + logger.info(f"Extracted {len(df)} workouts") + return df + + +def extract_activity_summaries(health_data: dict) -> pd.DataFrame: + """Extract daily activity summaries (rings data) from the parsed XML. + + Activity summaries capture the three Apple Watch activity rings: active + energy burned, exercise minutes, and stand hours – per calendar day. + + Args: + health_data: Parsed XML dictionary (returned by :func:`load_xml_export`). + + Returns: + A :class:`~pandas.DataFrame` with one row per day. + """ + activity_list = _ensure_list(health_data["HealthData"]["ActivitySummary"]) + df = pd.DataFrame(activity_list) + logger.info(f"Extracted {len(df)} activity summaries") + return df diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data_cleaner.py b/tests/test_data_cleaner.py new file mode 100644 index 0000000..ec67518 --- /dev/null +++ b/tests/test_data_cleaner.py @@ -0,0 +1,208 @@ +"""Tests for the data cleaning and aggregation utilities.""" + +import pandas as pd +import pytest + +from awai.utils.data_cleaner import ( + DAILY_SUM_KEYS, + RECORD_TYPES, + aggregate_daily, + aggregate_monthly, + camel_to_snake, + clean_records, + filter_record_types, + rename_columns, +) + +# --------------------------------------------------------------------------- +# camel_to_snake +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "input_name, expected", + [ + ("camelCase", "camel_case"), + ("@type", "type"), + ("startDate", "start_date"), + ("VO2Max", "v_o2_max"), + ("snake_case", "snake_case"), + ("ActiveEnergyBurned", "active_energy_burned"), + ], +) +def test_camel_to_snake(input_name: str, expected: str) -> None: + assert camel_to_snake(input_name) == expected + + +# 
--------------------------------------------------------------------------- +# rename_columns +# --------------------------------------------------------------------------- + + +def test_rename_columns() -> None: + df = pd.DataFrame(columns=["@type", "startDate", "endDate"]) + renamed = rename_columns(df) + assert list(renamed.columns) == ["type", "start_date", "end_date"] + + +def test_rename_columns_does_not_mutate_original() -> None: + df = pd.DataFrame(columns=["@type", "startDate"]) + rename_columns(df) + assert "@type" in df.columns # original is unchanged + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +def _make_raw_records(n: int = 5) -> pd.DataFrame: + """Return a small synthetic raw-records DataFrame (mimics xmltodict output).""" + rows = [ + { + "@type": "HKQuantityTypeIdentifierActiveEnergyBurned", + "startDate": "2024-01-01 08:00:00 +0000", + "endDate": "2024-01-01 08:30:00 +0000", + "creationDate": "2024-01-01 08:30:00 +0000", + "value": str(i * 50), + "unit": "kcal", + "sourceName": "Apple Watch", + } + for i in range(1, n + 1) + ] + return pd.DataFrame(rows) + + +def _make_typed_records() -> dict[str, pd.DataFrame]: + """Return a minimal per-type dict suitable for aggregation tests.""" + df = pd.DataFrame( + { + "ActiveEnergyBurned": [100.0, 200.0, 150.0], + "Date": ["2024-01-01", "2024-01-01", "2024-01-02"], + "Day": ["Monday", "Monday", "Tuesday"], + "Month": ["January", "January", "January"], + } + ) + return {"ActiveEnergyBurned": df} + + +# --------------------------------------------------------------------------- +# clean_records +# --------------------------------------------------------------------------- + + +def test_clean_records_expected_columns() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + for col in ("type", "Date", "Day", "Month", "value", "unit", "duration"): + assert col in 
cleaned.columns, f"Expected column '{col}' missing" + + +def test_clean_records_type_prefix_stripped() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + assert not cleaned["type"].str.startswith("HKQuantityTypeIdentifier").any() + assert not cleaned["type"].str.startswith("HKCategoryTypeIdentifier").any() + + +def test_clean_records_value_is_float() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + assert pd.api.types.is_float_dtype(cleaned["value"]) + + +def test_clean_records_duration_is_timedelta() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + assert pd.api.types.is_timedelta64_dtype(cleaned["duration"]) + + +def test_clean_records_metadata_dropped() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + for col in ("source_name", "creation_date", "end_date"): + assert col not in cleaned.columns + + +def test_clean_records_non_numeric_value_becomes_1() -> None: + raw = _make_raw_records(1) + raw.at[0, "value"] = "not-a-number" + cleaned = clean_records(raw) + assert cleaned["value"].iloc[0] == pytest.approx(1.0) + + +# --------------------------------------------------------------------------- +# filter_record_types +# --------------------------------------------------------------------------- + + +def test_filter_record_types_returns_dict() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + result = filter_record_types(cleaned, ["ActiveEnergyBurned"]) + assert isinstance(result, dict) + assert "ActiveEnergyBurned" in result + + +def test_filter_record_types_renames_value_column() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + result = filter_record_types(cleaned, ["ActiveEnergyBurned"]) + assert "ActiveEnergyBurned" in result["ActiveEnergyBurned"].columns + assert "value" not in result["ActiveEnergyBurned"].columns + + +def test_filter_record_types_all_defaults_present() -> None: + raw = _make_raw_records() + cleaned = clean_records(raw) + 
result = filter_record_types(cleaned) + assert set(result.keys()) == set(RECORD_TYPES) + + +# --------------------------------------------------------------------------- +# aggregate_daily +# --------------------------------------------------------------------------- + + +def test_aggregate_daily_sums_same_date() -> None: + by_type = _make_typed_records() + daily = aggregate_daily(by_type, keys=["ActiveEnergyBurned"]) + result = daily["ActiveEnergyBurned"] + row_val = result.loc[result["Date"] == "2024-01-01", "ActiveEnergyBurned"] + assert row_val.values[0] == pytest.approx(300.0) + + +def test_aggregate_daily_separate_dates() -> None: + by_type = _make_typed_records() + daily = aggregate_daily(by_type, keys=["ActiveEnergyBurned"]) + result = daily["ActiveEnergyBurned"] + assert len(result) == 2 # two distinct dates + + +def test_aggregate_daily_missing_key_skipped(caplog: pytest.LogCaptureFixture) -> None: + import logging + + by_type: dict = {} + with caplog.at_level(logging.WARNING): + daily = aggregate_daily(by_type, keys=["ActiveEnergyBurned"]) + assert "ActiveEnergyBurned" not in daily + + +# --------------------------------------------------------------------------- +# aggregate_monthly +# --------------------------------------------------------------------------- + + +def test_aggregate_monthly_sums_full_month() -> None: + by_type = _make_typed_records() + monthly = aggregate_monthly(by_type, keys=["ActiveEnergyBurned"]) + result = monthly["ActiveEnergyBurned"] + row_val = result.loc[result["Date"] == "2024-01", "ActiveEnergyBurned"] + assert row_val.values[0] == pytest.approx(450.0) + + +def test_aggregate_monthly_one_row_per_month() -> None: + by_type = _make_typed_records() + monthly = aggregate_monthly(by_type, keys=["ActiveEnergyBurned"]) + result = monthly["ActiveEnergyBurned"] + assert len(result) == 1 diff --git a/tests/test_xml_parser.py b/tests/test_xml_parser.py new file mode 100644 index 0000000..6770f0d --- /dev/null +++ 
b/tests/test_xml_parser.py
@@ -0,0 +1,95 @@
+"""Tests for the XML parsing utilities."""
+
+import textwrap
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from awai.utils.xml_parser import (
+    extract_activity_summaries,
+    extract_records,
+    extract_workouts,
+    load_xml_export,
+)
+
+# ---------------------------------------------------------------------------
+# Minimal Apple Watch XML fixture
+# ---------------------------------------------------------------------------
+
+MINIMAL_XML = textwrap.dedent("""\
+    <HealthData locale="en_US">
+     <ExportDate value="2024-01-02 10:00:00 +0000"/>
+     <Record type="HKQuantityTypeIdentifierActiveEnergyBurned" sourceName="Apple Watch" unit="kcal" startDate="2024-01-01 08:00:00 +0000" endDate="2024-01-01 08:30:00 +0000" value="250"/>
+     <Workout workoutActivityType="HKWorkoutActivityTypeRunning" duration="30" durationUnit="min" startDate="2024-01-01 08:00:00 +0000" endDate="2024-01-01 08:30:00 +0000"/>
+     <ActivitySummary dateComponents="2024-01-01" activeEnergyBurned="500" appleExerciseTime="30" appleStandHours="10"/>
+    </HealthData>
+""")
+
+
+@pytest.fixture()
+def xml_file(tmp_path: Path) -> Path:
+    """Write the minimal XML snippet to a temporary file."""
+    p = tmp_path / "export.xml"
+    p.write_text(MINIMAL_XML, encoding="utf-8")
+    return p
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_load_xml_export_returns_dict(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    assert isinstance(data, dict)
+    assert "HealthData" in data
+
+
+def test_extract_records_returns_dataframe(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_records(data)
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 1
+    assert "@type" in df.columns
+
+
+def test_extract_records_value(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_records(data)
+    assert df["@value"].iloc[0] == "250"
+
+
+def test_extract_workouts_returns_dataframe(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_workouts(data)
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 1
+
+
+def test_extract_activity_summaries_returns_dataframe(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_activity_summaries(data)
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 1
+
+
+def test_extract_activity_summaries_columns(xml_file: Path) -> None:
+    data = load_xml_export(xml_file)
+    df = extract_activity_summaries(data)
+    assert "@dateComponents" in df.columns