diff --git a/src/climatevision/data/README.md b/src/climatevision/data/README.md
new file mode 100644
index 0000000..ee3ea52
--- /dev/null
+++ b/src/climatevision/data/README.md
@@ -0,0 +1,46 @@
+# Data Pipeline
+
+Sentinel-2 ingestion, band mapping, and preprocessing for ClimateVision.
+
+## Modules
+
+| File | Purpose |
+|------|---------|
+| `gee_downloader.py` | Download real Sentinel-2 tiles from Google Earth Engine for a given bbox + date range. Falls back to a labelled synthetic tile (`is_synthetic: true`) when GEE credentials are missing. |
+| `band_mapping.py` | Single source of truth for which spectral bands each analysis type requires. Reads from `config.yaml`. |
+| `preprocessing.py` | Cloud masking (SCL band), normalisation, resampling 20m bands to 10m, tiling to 256×256. |
+| `transforms.py` | Augmentation pipeline (flips, rotations, spectral jitter) for training DataLoaders. |
+| `sampling.py` | Tile sampling strategies (random, balanced, stratified by region). |
+| `quality.py` | Per-tile QA (cloud %, NaN ratio, band coverage). |
+| `validation.py` | Schema validation for incoming requests and downloaded tiles. |
+
+## Analysis-Type Band Contract
+
+Every analysis type has its own band list in `config.yaml`. The pipeline must use `get_bands_for_analysis(analysis_type)` — never hardcode band lists.
+
+| Analysis | Bands | Channels |
+|----------|-------|----------|
+| `deforestation` | B04, B03, B02, B08 | 4 |
+| `ice_melting` | B02, B03, B04, B11 | 4 |
+| `flooding` | B03, B08, B11 | 3 |
+
+## Cloud Masking
+
+`apply_scl_cloud_mask(image, scl_band)` zeroes out pixels classified as no-data, saturated, dark, shadow, medium/high-probability cloud, or snow/ice using the Sentinel-2 Scene Classification Layer (SCL). This must run **before** the model forward pass.
+
+Valid SCL classes kept: 4 (vegetation), 5 (bare soil), 6 (water), 7 (low-probability cloud), 10 (thin cirrus).
+Masked out: 0 (no-data), 1 (saturated), 2 (dark), 3 (shadow), 8/9 (medium/high-probability cloud), 11 (snow/ice).
+
+## Synthetic Fallback
+
+If GEE auth fails, the downloader returns a deterministic synthetic tile seeded by the bbox so the same region always yields the same fallback. The metadata always includes `is_synthetic: true` so the API can warn the caller.
+
+## Environment
+
+```
+GEE_PROJECT_ID=your-project-id
+GEE_SERVICE_ACCOUNT=svc@project.iam.gserviceaccount.com
+GEE_SERVICE_ACCOUNT_KEY=secrets/gee-key.json
+```
+
+Run `python scripts/setup_gee.py` to verify credentials.
diff --git a/team_docs/Adeolu_Mary_Oshadare_Role.pdf b/team_docs/Adeolu_Mary_Oshadare_Role.pdf
new file mode 100644
index 0000000..51b940e
Binary files /dev/null and b/team_docs/Adeolu_Mary_Oshadare_Role.pdf differ
diff --git a/tests/test_band_mapping.py b/tests/test_band_mapping.py
new file mode 100644
index 0000000..4f5832a
--- /dev/null
+++ b/tests/test_band_mapping.py
@@ -0,0 +1,83 @@
+"""Smoke tests for analysis-aware Sentinel-2 band mapping."""
+from __future__ import annotations
+
+import pytest
+
+from climatevision.data.band_mapping import (
+    SCL_BAND,
+    SENTINEL2_BAND_ORDER,
+    get_band_indices,
+    get_bands_for_analysis,
+    get_bands_for_analysis_with_scl,
+    get_model_config,
+    is_analysis_enabled,
+    list_enabled_analysis_types,
+)
+
+
+def test_sentinel2_band_order_has_13_bands():  # checks length and both endpoints of the band-order constant
+    assert len(SENTINEL2_BAND_ORDER) == 13
+    assert SENTINEL2_BAND_ORDER[0] == "B01"
+    assert SENTINEL2_BAND_ORDER[-1] == "B12"
+
+
+def test_deforestation_uses_four_bands():
+    bands = get_bands_for_analysis("deforestation")
+    assert len(bands) == 4
+    assert set(bands) == {"B02", "B03", "B04", "B08"}  # set comparison: band order is deliberately not pinned here
+
+
+def test_flooding_uses_three_bands():
+    bands = get_bands_for_analysis("flooding")
+    assert len(bands) == 3
+    assert "B11" in bands  # only B11 membership is pinned; the other two bands are not checked
+
+
+def test_ice_melting_uses_swir():
+    bands = get_bands_for_analysis("ice_melting")
+    assert "B11" in bands  # B11 is presumably the SWIR band the test name refers to — confirm against config.yaml
+
+
+def test_scl_appended_for_cloud_masking():
+    bands = get_bands_for_analysis_with_scl("deforestation")
+    assert SCL_BAND in bands
+    assert bands[-1] == SCL_BAND  # SCL is expected as the final entry of the returned list
+
+
+def test_scl_not_duplicated():  # two separate calls; each result must contain SCL exactly once
+    bands_with_scl = get_bands_for_analysis_with_scl("deforestation")
+    bands_again = get_bands_for_analysis_with_scl("deforestation")
+    assert bands_with_scl.count(SCL_BAND) == 1
+    assert bands_again.count(SCL_BAND) == 1  # a second call must not have accumulated another SCL entry
+
+
+def test_band_indices_resolve_correctly():
+    indices = get_band_indices(["B04", "B03", "B02", "B08"])
+    assert indices == [3, 2, 1, 7]  # presumably zero-based positions in SENTINEL2_BAND_ORDER, in input order — confirm
+
+
+def test_band_indices_rejects_unknown():
+    with pytest.raises(ValueError, match="Unknown"):  # match is a regex search against the error message
+        get_band_indices(["B99"])
+
+
+def test_band_indices_rejects_scl_directly():
+    with pytest.raises(ValueError, match="SCL"):  # SCL itself must not resolve to a band index
+        get_band_indices([SCL_BAND])
+
+
+def test_enabled_analysis_types_include_active_three():
+    enabled = list_enabled_analysis_types()
+    for name in ("deforestation", "ice_melting", "flooding"):
+        assert name in enabled, f"{name} should be enabled"
+
+
+def test_disabled_analysis_types():  # drought and wildfire are expected to be reported as not enabled
+    assert not is_analysis_enabled("drought")
+    assert not is_analysis_enabled("wildfire")
+
+
+def test_model_config_carries_channels_and_classes():
+    cfg = get_model_config("flooding")
+    assert cfg["in_channels"] == 3  # equals the band count asserted for flooding in this module
+    assert cfg["num_classes"] == 3